* [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls
  2020-08-13 17:56 [PATCHSET " Jens Axboe
@ 2020-08-13 17:56 ` Jens Axboe
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-13 17:56 UTC (permalink / raw)
  To: io-uring; +Cc: david, jmoyer, Jens Axboe

Instead of maintaining (and setting/remembering) iov_iter size and
segment counts, just put the iov_iter in the async part of the IO
structure.

This is mostly a preparation patch for doing appropriate internal retries
for short reads, but it also cleans up the state handling nicely and
simplifies it quite a bit.

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 118 ++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 62 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1ec25ee71372..a20fccf91d76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -508,9 +508,7 @@ struct io_async_msghdr {
 
 struct io_async_rw {
 	struct iovec			fast_iov[UIO_FASTIOV];
-	struct iovec			*iov;
-	ssize_t				nr_segs;
-	ssize_t				size;
+	struct iov_iter			iter;
 	struct wait_page_queue		wpq;
 };
 
@@ -915,9 +913,8 @@ static void io_file_put_work(struct work_struct *work);
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
 			       bool needs_lock);
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
-			     struct iov_iter *iter);
+static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
+			     struct iovec *fast_iov, struct iov_iter *iter);
 
 static struct kmem_cache *req_cachep;
 
@@ -2299,7 +2296,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		goto end_req;
-	ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter);
 	if (!ret)
 		return true;
 	kfree(iovec);
@@ -2830,6 +2827,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
 
+	if (req->io) {
+		struct io_async_rw *iorw = &req->io->rw;
+
+		*iovec = NULL;
+		return iov_iter_count(&iorw->iter);
+	}
+
 	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
 		if (req->flags & REQ_F_BUFFER_SELECT) {
 			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
@@ -2845,14 +2849,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		return ret < 0 ? ret : sqe_len;
 	}
 
-	if (req->io) {
-		struct io_async_rw *iorw = &req->io->rw;
-
-		iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
-		*iovec = NULL;
-		return iorw->size;
-	}
-
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
 		if (!ret) {
@@ -2930,21 +2926,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	return ret;
 }
 
-static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
-			  struct iovec *iovec, struct iovec *fast_iov,
-			  struct iov_iter *iter)
+static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
+			  struct iovec *fast_iov, struct iov_iter *iter)
 {
 	struct io_async_rw *rw = &req->io->rw;
 
-	rw->nr_segs = iter->nr_segs;
-	rw->size = io_size;
+	memcpy(&rw->iter, iter, sizeof(*iter));
 	if (!iovec) {
-		rw->iov = rw->fast_iov;
-		if (rw->iov != fast_iov)
-			memcpy(rw->iov, fast_iov,
+		rw->iter.iov = rw->fast_iov;
+		if (rw->iter.iov != fast_iov)
+			memcpy((void *) rw->iter.iov, fast_iov,
 			       sizeof(struct iovec) * iter->nr_segs);
 	} else {
-		rw->iov = iovec;
+		rw->iter.iov = iovec;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 }
@@ -2963,9 +2957,8 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 	return  __io_alloc_async_ctx(req);
 }
 
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
-			     struct iov_iter *iter)
+static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
+			     struct iovec *fast_iov, struct iov_iter *iter)
 {
 	if (!io_op_defs[req->opcode].async_ctx)
 		return 0;
@@ -2973,7 +2966,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 		if (__io_alloc_async_ctx(req))
 			return -ENOMEM;
 
-		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
 	return 0;
 }
@@ -2981,18 +2974,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
 				   bool force_nonblock)
 {
-	struct io_async_ctx *io = req->io;
-	struct iov_iter iter;
+	struct io_async_rw *iorw = &req->io->rw;
 	ssize_t ret;
 
-	io->rw.iov = io->rw.fast_iov;
+	iorw->iter.iov = iorw->fast_iov;
+	/* reset ->io around the iovec import, we don't want to use it */
 	req->io = NULL;
-	ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
+	ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
+				&iorw->iter, !force_nonblock);
+	req->io = container_of(iorw, struct io_async_ctx, rw);
 	if (unlikely(ret < 0))
 		return ret;
 
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
 	return 0;
 }
 
@@ -3090,7 +3084,8 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
  * succeed, or in rare cases where it fails, we then fall back to using the
  * async worker threads for a blocking retry.
  */
-static bool io_rw_should_retry(struct io_kiocb *req)
+static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
+			       struct iovec *fast_iov, struct iov_iter *iter)
 {
 	struct kiocb *kiocb = &req->rw.kiocb;
 	int ret;
@@ -3113,8 +3108,11 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	 * If request type doesn't require req->io to defer in general,
 	 * we need to allocate it here
 	 */
-	if (!req->io && __io_alloc_async_ctx(req))
-		return false;
+	if (!req->io) {
+		if (__io_alloc_async_ctx(req))
+			return false;
+		io_req_map_rw(req, iovec, fast_iov, iter);
+	}
 
 	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
 						io_async_buf_func, req);
@@ -3141,12 +3139,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
 	ssize_t io_size, ret, ret2;
-	unsigned long nr_segs;
 
-	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
+	if (req->io)
+		iter = &req->io->rw.iter;
+
+	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 	io_size = ret;
@@ -3160,30 +3160,26 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
-	nr_segs = iter.nr_segs;
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
 	if (unlikely(ret))
 		goto out_free;
 
-	ret2 = io_iter_do_read(req, &iter);
+	ret2 = io_iter_do_read(req, iter);
 
 	/* Catch -EAGAIN return for forced non-blocking submission */
 	if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
 		kiocb_done(kiocb, ret2, cs);
 	} else {
-		iter.count = iov_count;
-		iter.nr_segs = nr_segs;
 copy_iov:
-		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
-					&iter);
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
 		if (ret)
 			goto out_free;
 		/* it's copied and will be cleaned with ->io */
 		iovec = NULL;
 		/* if we can retry, do so with the callbacks armed */
-		if (io_rw_should_retry(req)) {
-			ret2 = io_iter_do_read(req, &iter);
+		if (io_rw_should_retry(req, iovec, inline_vecs, iter)) {
+			ret2 = io_iter_do_read(req, iter);
 			if (ret2 == -EIOCBQUEUED) {
 				goto out_free;
 			} else if (ret2 != -EAGAIN) {
@@ -3223,12 +3219,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
 	ssize_t ret, ret2, io_size;
-	unsigned long nr_segs;
 
-	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
+	if (req->io)
+		iter = &req->io->rw.iter;
+
+	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 	io_size = ret;
@@ -3247,8 +3245,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	    (req->flags & REQ_F_ISREG))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
-	nr_segs = iter.nr_segs;
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
 	if (unlikely(ret))
 		goto out_free;
@@ -3269,9 +3266,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	kiocb->ki_flags |= IOCB_WRITE;
 
 	if (req->file->f_op->write_iter)
-		ret2 = call_write_iter(req->file, kiocb, &iter);
+		ret2 = call_write_iter(req->file, kiocb, iter);
 	else if (req->file->f_op->write)
-		ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+		ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
 	else
 		ret2 = -EINVAL;
 
@@ -3284,11 +3281,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		kiocb_done(kiocb, ret2, cs);
 	} else {
-		iter.count = iov_count;
-		iter.nr_segs = nr_segs;
 copy_iov:
-		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
-					&iter);
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
 		if (ret)
 			goto out_free;
 		/* it's copied and will be cleaned with ->io */
@@ -5583,8 +5577,8 @@ static void __io_clean_op(struct io_kiocb *req)
 		case IORING_OP_WRITEV:
 		case IORING_OP_WRITE_FIXED:
 		case IORING_OP_WRITE:
-			if (io->rw.iov != io->rw.fast_iov)
-				kfree(io->rw.iov);
+			if (io->rw.iter.iov != io->rw.fast_iov)
+				kfree(io->rw.iter.iov);
 			break;
 		case IORING_OP_RECVMSG:
 		case IORING_OP_SENDMSG:
-- 
2.28.0



* [PATCHSET v2 0/2] io_uring: handle short reads internally
@ 2020-08-14 19:54 Jens Axboe
  2020-08-14 19:54 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-14 19:54 UTC (permalink / raw)
  To: io-uring; +Cc: david, jmoyer

Hi,

Since we've had a few cases of applications not dealing with this
appropriately, I believe the safest course of action is to ensure that
we don't return short reads when we really don't have to.

The first patch is just a prep patch that retains iov_iter state over
retries, while the second one actually enables just doing retries if
we get a short read back.

This passes all my testing, both the liburing regression tests and
tests that explicitly trigger internal short reads and hence retry
based on current state. No short reads are passed back to the
application.
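
For reference, this is roughly the bookkeeping that a short read
otherwise forces on the application (illustrative sketch only, using
liburing; read_full() is a hypothetical helper and not part of liburing
or of this series):

#include <liburing.h>
#include <errno.h>
#include <sys/types.h>

static ssize_t read_full(struct io_uring *ring, int fd, char *buf,
			 size_t len, off_t off)
{
	size_t done = 0;

	while (done < len) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int ret;

		if (!sqe)
			return -EBUSY;
		/* resubmit for the remainder, at the advanced offset */
		io_uring_prep_read(sqe, fd, buf + done, len - done,
				   off + done);
		ret = io_uring_submit(ring);
		if (ret < 0)
			return ret;
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;
		io_uring_cqe_seen(ring, cqe);
		if (ret < 0)
			return ret;	/* IO error */
		if (!ret)
			break;		/* EOF */
		done += ret;		/* short read, go again for the rest */
	}
	return done;
}

With this series, the kernel performs those retries internally for
buffered reads instead.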

Changes since v1:

- Fixed an issue with fixed/registered buffers
- Rewrite io_read() code flow to be a lot more readable
- Add comments

-- 
Jens Axboe




* [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls
  2020-08-14 19:54 [PATCHSET v2 0/2] io_uring: handle short reads internally Jens Axboe
@ 2020-08-14 19:54 ` Jens Axboe
  2020-08-14 19:54 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe
  2020-08-17  9:25 ` [PATCHSET v2 0/2] io_uring: handle short reads internally Stefan Metzmacher
  2 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-14 19:54 UTC (permalink / raw)
  To: io-uring; +Cc: david, jmoyer, Jens Axboe

Instead of maintaining (and setting/remembering) iov_iter size and
segment counts, just put the iov_iter in the async part of the IO
structure.

This is mostly a preparation patch for doing appropriate internal retries
for short reads, but it also cleans up the state handling nicely and
simplifies it quite a bit.

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 114 ++++++++++++++++++++++++--------------------------
 1 file changed, 54 insertions(+), 60 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1ec25ee71372..409cfa1d6d90 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -508,9 +508,7 @@ struct io_async_msghdr {
 
 struct io_async_rw {
 	struct iovec			fast_iov[UIO_FASTIOV];
-	struct iovec			*iov;
-	ssize_t				nr_segs;
-	ssize_t				size;
+	struct iov_iter			iter;
 	struct wait_page_queue		wpq;
 };
 
@@ -915,8 +913,7 @@ static void io_file_put_work(struct work_struct *work);
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
 			       bool needs_lock);
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
+static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
 			     struct iov_iter *iter);
 
 static struct kmem_cache *req_cachep;
@@ -2299,7 +2296,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		goto end_req;
-	ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+	ret = io_setup_async_rw(req, iovec, &iter);
 	if (!ret)
 		return true;
 	kfree(iovec);
@@ -2820,6 +2817,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	ssize_t ret;
 	u8 opcode;
 
+	if (req->io) {
+		struct io_async_rw *iorw = &req->io->rw;
+
+		*iovec = NULL;
+		return iov_iter_count(&iorw->iter);
+	}
+
 	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
@@ -2845,14 +2849,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		return ret < 0 ? ret : sqe_len;
 	}
 
-	if (req->io) {
-		struct io_async_rw *iorw = &req->io->rw;
-
-		iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
-		*iovec = NULL;
-		return iorw->size;
-	}
-
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
 		if (!ret) {
@@ -2930,21 +2926,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	return ret;
 }
 
-static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
-			  struct iovec *iovec, struct iovec *fast_iov,
+static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 			  struct iov_iter *iter)
 {
 	struct io_async_rw *rw = &req->io->rw;
 
-	rw->nr_segs = iter->nr_segs;
-	rw->size = io_size;
+	memcpy(&rw->iter, iter, sizeof(*iter));
 	if (!iovec) {
-		rw->iov = rw->fast_iov;
-		if (rw->iov != fast_iov)
-			memcpy(rw->iov, fast_iov,
+		rw->iter.iov = rw->fast_iov;
+		if (rw->fast_iov != iter->iov)
+			memcpy(rw->fast_iov, iter->iov,
 			       sizeof(struct iovec) * iter->nr_segs);
 	} else {
-		rw->iov = iovec;
+		rw->iter.iov = iovec;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 }
@@ -2963,8 +2957,7 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 	return  __io_alloc_async_ctx(req);
 }
 
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
+static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
 			     struct iov_iter *iter)
 {
 	if (!io_op_defs[req->opcode].async_ctx)
@@ -2973,7 +2966,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 		if (__io_alloc_async_ctx(req))
 			return -ENOMEM;
 
-		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+		io_req_map_rw(req, iovec, iter);
 	}
 	return 0;
 }
@@ -2981,18 +2974,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
 				   bool force_nonblock)
 {
-	struct io_async_ctx *io = req->io;
-	struct iov_iter iter;
+	struct io_async_rw *iorw = &req->io->rw;
 	ssize_t ret;
 
-	io->rw.iov = io->rw.fast_iov;
+	iorw->iter.iov = iorw->fast_iov;
+	/* reset ->io around the iovec import, we don't want to use it */
 	req->io = NULL;
-	ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
+	ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
+				&iorw->iter, !force_nonblock);
+	req->io = container_of(iorw, struct io_async_ctx, rw);
 	if (unlikely(ret < 0))
 		return ret;
 
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	io_req_map_rw(req, iorw->iter.iov, &iorw->iter);
 	return 0;
 }
 
@@ -3090,7 +3084,8 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
  * succeed, or in rare cases where it fails, we then fall back to using the
  * async worker threads for a blocking retry.
  */
-static bool io_rw_should_retry(struct io_kiocb *req)
+static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
+			       struct iovec *fast_iov, struct iov_iter *iter)
 {
 	struct kiocb *kiocb = &req->rw.kiocb;
 	int ret;
@@ -3113,8 +3108,11 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	 * If request type doesn't require req->io to defer in general,
 	 * we need to allocate it here
 	 */
-	if (!req->io && __io_alloc_async_ctx(req))
-		return false;
+	if (!req->io) {
+		if (__io_alloc_async_ctx(req))
+			return false;
+		io_req_map_rw(req, iovec, iter);
+	}
 
 	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
 						io_async_buf_func, req);
@@ -3141,12 +3139,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
-	ssize_t io_size, ret, ret2;
-	unsigned long nr_segs;
+	ssize_t io_size, ret, ret2 = 0;
+
+	if (req->io)
+		iter = &req->io->rw.iter;
 
-	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
+	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 	io_size = ret;
@@ -3160,30 +3160,26 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
-	nr_segs = iter.nr_segs;
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
 	if (unlikely(ret))
 		goto out_free;
 
-	ret2 = io_iter_do_read(req, &iter);
+	ret2 = io_iter_do_read(req, iter);
 
 	/* Catch -EAGAIN return for forced non-blocking submission */
 	if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
 		kiocb_done(kiocb, ret2, cs);
 	} else {
-		iter.count = iov_count;
-		iter.nr_segs = nr_segs;
 copy_iov:
-		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
-					&iter);
+		ret = io_setup_async_rw(req, iovec, iter);
 		if (ret)
 			goto out_free;
 		/* it's copied and will be cleaned with ->io */
 		iovec = NULL;
 		/* if we can retry, do so with the callbacks armed */
-		if (io_rw_should_retry(req)) {
-			ret2 = io_iter_do_read(req, &iter);
+		if (io_rw_should_retry(req, iovec, inline_vecs, iter)) {
+			ret2 = io_iter_do_read(req, iter);
 			if (ret2 == -EIOCBQUEUED) {
 				goto out_free;
 			} else if (ret2 != -EAGAIN) {
@@ -3223,12 +3219,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
 	ssize_t ret, ret2, io_size;
-	unsigned long nr_segs;
 
-	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
+	if (req->io)
+		iter = &req->io->rw.iter;
+
+	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 	io_size = ret;
@@ -3247,8 +3245,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	    (req->flags & REQ_F_ISREG))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
-	nr_segs = iter.nr_segs;
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
 	if (unlikely(ret))
 		goto out_free;
@@ -3269,9 +3266,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	kiocb->ki_flags |= IOCB_WRITE;
 
 	if (req->file->f_op->write_iter)
-		ret2 = call_write_iter(req->file, kiocb, &iter);
+		ret2 = call_write_iter(req->file, kiocb, iter);
 	else if (req->file->f_op->write)
-		ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+		ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
 	else
 		ret2 = -EINVAL;
 
@@ -3284,11 +3281,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		kiocb_done(kiocb, ret2, cs);
 	} else {
-		iter.count = iov_count;
-		iter.nr_segs = nr_segs;
 copy_iov:
-		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
-					&iter);
+		ret = io_setup_async_rw(req, iovec, iter);
 		if (ret)
 			goto out_free;
 		/* it's copied and will be cleaned with ->io */
@@ -5583,8 +5577,8 @@ static void __io_clean_op(struct io_kiocb *req)
 		case IORING_OP_WRITEV:
 		case IORING_OP_WRITE_FIXED:
 		case IORING_OP_WRITE:
-			if (io->rw.iov != io->rw.fast_iov)
-				kfree(io->rw.iov);
+			if (io->rw.iter.iov != io->rw.fast_iov)
+				kfree(io->rw.iter.iov);
 			break;
 		case IORING_OP_RECVMSG:
 		case IORING_OP_SENDMSG:
-- 
2.28.0



* [PATCH 2/2] io_uring: internally retry short reads
  2020-08-14 19:54 [PATCHSET v2 0/2] io_uring: handle short reads internally Jens Axboe
  2020-08-14 19:54 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe
@ 2020-08-14 19:54 ` Jens Axboe
  2020-08-17  9:25 ` [PATCHSET v2 0/2] io_uring: handle short reads internally Stefan Metzmacher
  2 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-14 19:54 UTC (permalink / raw)
  To: io-uring; +Cc: david, jmoyer, Jens Axboe

We've had a few cases of applications not handling short reads properly,
and it is understandable as short reads aren't really expected if the
application isn't doing non-blocking IO.

Now that we retain the iov_iter over retries, we can implement internal
retry pretty trivially. This ensures that we don't return a short read,
even for buffered reads on page cache conflicts.

Clean up the deep nesting and hard-to-read flow of io_read() as well;
it's much more straightforward now to read and understand. Also add a
few comments explaining the logic.

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 111 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 40 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 409cfa1d6d90..e1aacc0b9bd0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -509,6 +509,7 @@ struct io_async_msghdr {
 struct io_async_rw {
 	struct iovec			fast_iov[UIO_FASTIOV];
 	struct iov_iter			iter;
+	size_t				bytes_done;
 	struct wait_page_queue		wpq;
 };
 
@@ -914,7 +915,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
 			       bool needs_lock);
 static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
-			     struct iov_iter *iter);
+			     struct iov_iter *iter, bool force);
 
 static struct kmem_cache *req_cachep;
 
@@ -2296,7 +2297,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		goto end_req;
-	ret = io_setup_async_rw(req, iovec, &iter);
+	ret = io_setup_async_rw(req, iovec, &iter, false);
 	if (!ret)
 		return true;
 	kfree(iovec);
@@ -2586,6 +2587,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
+	/* add previously done IO, if any */
+	if (req->io && req->io->rw.bytes_done > 0) {
+		if (ret < 0)
+			ret = req->io->rw.bytes_done;
+		else
+			ret += req->io->rw.bytes_done;
+	}
+
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
 	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
@@ -2932,6 +2941,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 	struct io_async_rw *rw = &req->io->rw;
 
 	memcpy(&rw->iter, iter, sizeof(*iter));
+	rw->bytes_done = 0;
 	if (!iovec) {
 		rw->iter.iov = rw->fast_iov;
 		if (rw->fast_iov != iter->iov)
@@ -2958,9 +2968,9 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 }
 
 static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec,
-			     struct iov_iter *iter)
+			     struct iov_iter *iter, bool force)
 {
-	if (!io_op_defs[req->opcode].async_ctx)
+	if (!force && !io_op_defs[req->opcode].async_ctx)
 		return 0;
 	if (!req->io) {
 		if (__io_alloc_async_ctx(req))
@@ -3084,8 +3094,7 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
  * succeed, or in rare cases where it fails, we then fall back to using the
  * async worker threads for a blocking retry.
  */
-static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
-			       struct iovec *fast_iov, struct iov_iter *iter)
+static bool io_rw_should_retry(struct io_kiocb *req)
 {
 	struct kiocb *kiocb = &req->rw.kiocb;
 	int ret;
@@ -3094,8 +3103,8 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
 	if (req->flags & REQ_F_NOWAIT)
 		return false;
 
-	/* already tried, or we're doing O_DIRECT */
-	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+	/* Only for buffered IO */
+	if (kiocb->ki_flags & IOCB_DIRECT)
 		return false;
 	/*
 	 * just use poll if we can, and don't attempt if the fs doesn't
@@ -3104,16 +3113,6 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
 	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
 		return false;
 
-	/*
-	 * If request type doesn't require req->io to defer in general,
-	 * we need to allocate it here
-	 */
-	if (!req->io) {
-		if (__io_alloc_async_ctx(req))
-			return false;
-		io_req_map_rw(req, iovec, iter);
-	}
-
 	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
 						io_async_buf_func, req);
 	if (!ret) {
@@ -3140,8 +3139,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter __iter, *iter = &__iter;
+	ssize_t io_size, ret, ret2;
 	size_t iov_count;
-	ssize_t io_size, ret, ret2 = 0;
 
 	if (req->io)
 		iter = &req->io->rw.iter;
@@ -3151,6 +3150,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		return ret;
 	io_size = ret;
 	req->result = io_size;
+	ret = 0;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
@@ -3165,31 +3165,62 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	if (unlikely(ret))
 		goto out_free;
 
-	ret2 = io_iter_do_read(req, iter);
+	ret = io_iter_do_read(req, iter);
 
-	/* Catch -EAGAIN return for forced non-blocking submission */
-	if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
-		kiocb_done(kiocb, ret2, cs);
-	} else {
+	if (!ret) {
+		goto done;
+	} else if (ret == -EIOCBQUEUED) {
+		ret = 0;
+		goto out_free;
+	} else if (ret == -EAGAIN) {
+		ret2 = io_setup_async_rw(req, iovec, iter, false);
+		if (ret2 < 0)
+			ret = ret2;
+		goto out_free;
+	} else if (ret < 0) {
+		goto out_free;
+	}
+
+	/* read it all, or we did blocking attempt. no retry. */
+	if (!iov_iter_count(iter) || !force_nonblock)
+		goto done;
+
+	io_size -= ret;
 copy_iov:
-		ret = io_setup_async_rw(req, iovec, iter);
-		if (ret)
-			goto out_free;
-		/* it's copied and will be cleaned with ->io */
-		iovec = NULL;
-		/* if we can retry, do so with the callbacks armed */
-		if (io_rw_should_retry(req, iovec, inline_vecs, iter)) {
-			ret2 = io_iter_do_read(req, iter);
-			if (ret2 == -EIOCBQUEUED) {
-				goto out_free;
-			} else if (ret2 != -EAGAIN) {
-				kiocb_done(kiocb, ret2, cs);
-				goto out_free;
-			}
-		}
+	ret2 = io_setup_async_rw(req, iovec, iter, true);
+	if (ret2) {
+		ret = ret2;
+		goto out_free;
+	}
+	/* it's copied and will be cleaned with ->io */
+	iovec = NULL;
+	/* now use our persistent iterator, if we aren't already */
+	iter = &req->io->rw.iter;
+retry:
+	req->io->rw.bytes_done += ret;
+	/* if we can retry, do so with the callbacks armed */
+	if (!io_rw_should_retry(req)) {
 		kiocb->ki_flags &= ~IOCB_WAITQ;
 		return -EAGAIN;
 	}
+
+	/*
+	 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
+	 * get -EIOCBQUEUED, then we'll get a notification when the desired
+	 * page gets unlocked. We can also get a partial read here, and if we
+	 * do, then just retry at the new offset.
+	 */
+	ret = io_iter_do_read(req, iter);
+	if (ret == -EIOCBQUEUED) {
+		ret = 0;
+		goto out_free;
+	} else if (ret > 0 && ret < io_size) {
+		/* we got some bytes, but not all. retry. */
+		goto retry;
+	}
+done:
+	kiocb_done(kiocb, ret, cs);
+	ret = 0;
 out_free:
 	if (iovec)
 		kfree(iovec);
@@ -3282,7 +3313,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		kiocb_done(kiocb, ret2, cs);
 	} else {
 copy_iov:
-		ret = io_setup_async_rw(req, iovec, iter);
+		ret = io_setup_async_rw(req, iovec, iter, false);
 		if (ret)
 			goto out_free;
 		/* it's copied and will be cleaned with ->io */
-- 
2.28.0



* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-14 19:54 [PATCHSET v2 0/2] io_uring: handle short reads internally Jens Axboe
  2020-08-14 19:54 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe
  2020-08-14 19:54 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe
@ 2020-08-17  9:25 ` Stefan Metzmacher
  2020-08-18  3:29   ` Jens Axboe
  2 siblings, 1 reply; 15+ messages in thread
From: Stefan Metzmacher @ 2020-08-17  9:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: david, jmoyer



Hi Jens,

> Since we've had a few cases of applications not dealing with this
> appropriately, I believe the safest course of action is to ensure that
> we don't return short reads when we really don't have to.
>
> The first patch is just a prep patch that retains iov_iter state over
> retries, while the second one actually enables just doing retries if
> we get a short read back.
> 
> This passes all my testing, both the liburing regression tests and
> tests that explicitly trigger internal short reads and hence retry
> based on current state. No short reads are passed back to the
> application.

Thanks! I was going to ask about exactly that :-)

It wasn't clear why returning short reads was justified by resulting
in better performance... as it means the application needs to do
a lot more work and syscalls.
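
For comparison, with internal retries the application side should
reduce to a single submit/complete pair (minimal illustrative sketch;
read_once() is a hypothetical helper, assuming liburing, an initialized
ring and an open fd):

static int read_once(struct io_uring *ring, int fd, char *buf,
		     size_t len, off_t off)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int res, ret;

	io_uring_prep_read(sqe, fd, buf, len, off);
	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	/* with internal retries, res should be the full len (short only
	 * at EOF) or a negative error code; no resubmission loop needed */
	res = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return res;
}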

Will this be backported?

metze




* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-17  9:25 ` [PATCHSET v2 0/2] io_uring: handle short reads internally Stefan Metzmacher
@ 2020-08-18  3:29   ` Jens Axboe
  2020-08-18  4:12     ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18  3:29 UTC (permalink / raw)
  To: Stefan Metzmacher, io-uring; +Cc: david, jmoyer

On 8/17/20 2:25 AM, Stefan Metzmacher wrote:
> Hi Jens,
> 
>> Since we've had a few cases of applications not dealing with this
>> appropriately, I believe the safest course of action is to ensure that
>> we don't return short reads when we really don't have to.
>>
>> The first patch is just a prep patch that retains iov_iter state over
>> retries, while the second one actually enables just doing retries if
>> we get a short read back.
>>
>> This passes all my testing, both the liburing regression tests and
>> tests that explicitly trigger internal short reads and hence retry
>> based on current state. No short reads are passed back to the
>> application.
> 
> Thanks! I was going to ask about exactly that :-)
> 
> It wasn't clear why returning short reads was justified by resulting
> in better performance... as it means the application needs to do
> a lot more work and syscalls.

It mostly boils down to figuring out a good way to do it. With the
task_work based retry and the async buffered reads, we're almost there, and
the prep patch adds the last remaining bits to retain the iov_iter state
across issues.

> Will this be backported?

I can, but not really in an efficient manner. It depends on the async
buffered work to make progress, and the task_work handling retry. The
latter means it's 5.7+, while the former is only in 5.9+...

We can make it work for earlier kernels by just using the thread offload
for that, and that may be worth doing. That would enable it in
5.7-stable and 5.8-stable. For that, you just need these two patches.
Patch 1 would work as-is, while patch 2 would need a small bit of
massaging since io_read() doesn't have the retry parts.

I'll give it a whirl just out of curiosity, then we can debate it after
that.

-- 
Jens Axboe



* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18  3:29   ` Jens Axboe
@ 2020-08-18  4:12     ` Jens Axboe
  2020-08-18  4:30       ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18  4:12 UTC (permalink / raw)
  To: Stefan Metzmacher, io-uring; +Cc: david, jmoyer


On 8/17/20 8:29 PM, Jens Axboe wrote:
> On 8/17/20 2:25 AM, Stefan Metzmacher wrote:
>> Hi Jens,
>>
>>> Since we've had a few cases of applications not dealing with this
>>> appropriately, I believe the safest course of action is to ensure that
>>> we don't return short reads when we really don't have to.
>>>
>>> The first patch is just a prep patch that retains iov_iter state over
>>> retries, while the second one actually enables just doing retries if
>>> we get a short read back.
>>>
>>> This passes all my testing, both the liburing regression tests and
>>> tests that explicitly trigger internal short reads and hence retry
>>> based on current state. No short reads are passed back to the
>>> application.
>>
>> Thanks! I was going to ask about exactly that :-)
>>
>> It wasn't clear why returning short reads was justified by resulting
>> in better performance... as it means the application needs to do
>> a lot more work and syscalls.
> 
> It mostly boils down to figuring out a good way to do it. With the
> task_work based retry and the async buffered reads, we're almost there, and
> the prep patch adds the last remaining bits to retain the iov_iter state
> across issues.
> 
>> Will this be backported?
> 
> I can, but not really in an efficient manner. It depends on the async
> buffered work to make progress, and the task_work handling retry. The
> latter means it's 5.7+, while the former is only in 5.9+...
> 
> We can make it work for earlier kernels by just using the thread offload
> for that, and that may be worth doing. That would enable it in
> 5.7-stable and 5.8-stable. For that, you just need these two patches.
> Patch 1 would work as-is, while patch 2 would need a small bit of
> massaging since io_read() doesn't have the retry parts.
> 
> I'll give it a whirl just out of curiosity, then we can debate it after
> that.

Here are the two patches against latest 5.7-stable (the rc branch, as
we had quite a few queued up after 5.9-rc1). Totally untested, just
wanted to see if it was doable.

First patch is mostly just applied, with various bits removed that we
don't have in 5.7. The second patch just does -EAGAIN punt for the
short read case, which will queue the remainder with io-wq for
async execution.

Obviously needs quite a bit of testing before it can go anywhere else,
but wanted to throw this out there in case you were interested in
giving it a go...

-- 
Jens Axboe


[-- Attachment #2: 0002-io_uring-internally-retry-short-reads.patch --]
[-- Type: text/x-patch, Size: 4389 bytes --]

From 14333b5b72f2d9f9e12d2abe72bda4ee6cc3e45c Mon Sep 17 00:00:00 2001
From: Jens Axboe <[email protected]>
Date: Mon, 17 Aug 2020 21:03:27 -0700
Subject: [PATCH 2/2] io_uring: internally retry short reads

We've had a few cases of applications not handling short reads properly,
and it is understandable as short reads aren't really expected if the
application isn't doing non-blocking IO.

Now that we retain the iov_iter over retries, we can implement internal
retry pretty trivially. This ensures that we don't return a short read,
even for buffered reads on page cache conflicts.

Clean up the deep nesting and hard-to-read flow of io_read() as well;
it's much more straightforward now to read and understand. Also add a
few comments explaining the logic.

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 61 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bdb88146b285..618a4ca29159 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -487,6 +487,7 @@ struct io_async_rw {
 	struct iovec			fast_iov[UIO_FASTIOV];
 	const struct iovec		*free_iovec;
 	struct iov_iter			iter;
+	size_t				bytes_done;
 };
 
 struct io_async_ctx {
@@ -2160,6 +2161,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
+	/* add previously done IO, if any */
+	if (req->io && req->io->rw.bytes_done > 0) {
+		if (ret < 0)
+			ret = req->io->rw.bytes_done;
+		else
+			ret += req->io->rw.bytes_done;
+	}
+
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
 	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
@@ -2507,6 +2516,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 
 	memcpy(&rw->iter, iter, sizeof(*iter));
 	rw->free_iovec = NULL;
+	rw->bytes_done = 0;
 	/* can only be fixed buffers, no need to do anything */
 	if (iter->type == ITER_BVEC)
 		return;
@@ -2543,9 +2553,9 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
-			     struct iov_iter *iter)
+			     struct iov_iter *iter, bool force)
 {
-	if (!io_op_defs[req->opcode].async_ctx)
+	if (!force && !io_op_defs[req->opcode].async_ctx)
 		return 0;
 	if (!req->io) {
 		if (__io_alloc_async_ctx(req))
@@ -2608,6 +2618,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
+	ret = 0;
 	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
@@ -2630,21 +2641,39 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 		else
 			ret2 = -EINVAL;
 
-		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2);
-		} else {
-copy_iov:
-			ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
-			if (ret)
-				goto out_free;
-			/* any defer here is final, must blocking retry */
-			if (!(req->flags & REQ_F_NOWAIT) &&
-			    !file_can_poll(req->file))
-				req->flags |= REQ_F_MUST_PUNT;
-			return -EAGAIN;
+		if (!ret2) {
+			goto done;
+		} else if (ret2 == -EIOCBQUEUED) {
+			ret = 0;
+			goto out_free;
+		} else if (ret2 == -EAGAIN) {
+			ret2 = 0;
+			goto copy_iov;
+		} else if (ret2 < 0) {
+			ret = ret2;
+			goto out_free;
+		}
+
+		/* read it all, or we did blocking attempt. no retry. */
+		if (!iov_iter_count(iter) || !force_nonblock) {
+			ret = ret2;
+			goto done;
 		}
+
+copy_iov:
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+		if (ret)
+			goto out_free;
+		req->io->rw.bytes_done += ret2;
+		/* any defer here is final, must blocking retry */
+		if (!(req->flags & REQ_F_NOWAIT) &&
+		    !file_can_poll(req->file))
+			req->flags |= REQ_F_MUST_PUNT;
+		return -EAGAIN;
 	}
+done:
+	kiocb_done(kiocb, ret);
+	ret = 0;
 out_free:
 	if (!(req->flags & REQ_F_NEED_CLEANUP))
 		kfree(iovec);
@@ -2762,7 +2791,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 			kiocb_done(kiocb, ret2);
 		} else {
 copy_iov:
-			ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
+			ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
 			if (ret)
 				goto out_free;
 			/* any defer here is final, must blocking retry */
-- 
2.28.0


[-- Attachment #3: 0001-io_uring-retain-iov_iter-state-over-io_read-io_write.patch --]
[-- Type: text/x-patch, Size: 9660 bytes --]

From a8f27d6bdc9d5eb3e0180c0389c61bba35474ef5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <[email protected]>
Date: Mon, 17 Aug 2020 20:55:27 -0700
Subject: [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write
 calls

Instead of maintaining (and setting/remembering) iov_iter size and
segment counts, just put the iov_iter in the async part of the IO
structure.

This is mostly a preparation patch for doing appropriate internal retries
for short reads, but it also cleans up the state handling nicely and
simplifies it quite a bit.

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 123 +++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 56 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e7cbe61f64c..bdb88146b285 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -485,9 +485,8 @@ struct io_async_msghdr {
 
 struct io_async_rw {
 	struct iovec			fast_iov[UIO_FASTIOV];
-	struct iovec			*iov;
-	ssize_t				nr_segs;
-	ssize_t				size;
+	const struct iovec		*free_iovec;
+	struct iov_iter			iter;
 };
 
 struct io_async_ctx {
@@ -2392,6 +2391,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	ssize_t ret;
 	u8 opcode;
 
+	if (req->io) {
+		struct io_async_rw *iorw = &req->io->rw;
+
+		*iovec = NULL;
+		return iov_iter_count(&iorw->iter);
+	}
+
 	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
@@ -2417,16 +2423,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		return ret < 0 ? ret : sqe_len;
 	}
 
-	if (req->io) {
-		struct io_async_rw *iorw = &req->io->rw;
-
-		*iovec = iorw->iov;
-		iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
-		if (iorw->iov == iorw->fast_iov)
-			*iovec = NULL;
-		return iorw->size;
-	}
-
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
 		if (!ret) {
@@ -2504,19 +2500,29 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	return ret;
 }
 
-static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
-			  struct iovec *iovec, struct iovec *fast_iov,
-			  struct iov_iter *iter)
+static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
+			  const struct iovec *fast_iov, struct iov_iter *iter)
 {
-	req->io->rw.nr_segs = iter->nr_segs;
-	req->io->rw.size = io_size;
-	req->io->rw.iov = iovec;
-	if (!req->io->rw.iov) {
-		req->io->rw.iov = req->io->rw.fast_iov;
-		if (req->io->rw.iov != fast_iov)
-			memcpy(req->io->rw.iov, fast_iov,
+	struct io_async_rw *rw = &req->io->rw;
+
+	memcpy(&rw->iter, iter, sizeof(*iter));
+	rw->free_iovec = NULL;
+	/* can only be fixed buffers, no need to do anything */
+	if (iter->type == ITER_BVEC)
+		return;
+	if (!iovec) {
+		unsigned iov_off = 0;
+
+		rw->iter.iov = rw->fast_iov;
+		if (iter->iov != fast_iov) {
+			iov_off = iter->iov - fast_iov;
+			rw->iter.iov += iov_off;
+		}
+		if (rw->fast_iov != fast_iov)
+			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
 			       sizeof(struct iovec) * iter->nr_segs);
 	} else {
+		rw->free_iovec = iovec;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 }
@@ -2535,8 +2541,8 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 	return  __io_alloc_async_ctx(req);
 }
 
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
+static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
+			     const struct iovec *fast_iov,
 			     struct iov_iter *iter)
 {
 	if (!io_op_defs[req->opcode].async_ctx)
@@ -2545,7 +2551,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 		if (__io_alloc_async_ctx(req))
 			return -ENOMEM;
 
-		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
 	return 0;
 }
@@ -2553,8 +2559,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			bool force_nonblock)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
+	struct io_async_rw *iorw = &req->io->rw;
 	ssize_t ret;
 
 	ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2568,15 +2573,17 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
+	iorw = &req->io->rw;
+	iorw->iter.iov = iorw->fast_iov;
+	/* reset ->io around the iovec import, we don't want to use it */
 	req->io = NULL;
-	ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
+	ret = io_import_iovec(READ, req, (struct iovec **) &iorw->iter.iov,
+				&iorw->iter, !force_nonblock);
+	req->io = container_of(iorw, struct io_async_ctx, rw);
 	if (ret < 0)
 		return ret;
 
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
 	return 0;
 }
 
@@ -2584,11 +2591,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
 	ssize_t io_size, ret;
 
-	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
+	if (req->io)
+		iter = &req->io->rw.iter;
+
+	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 
@@ -2608,15 +2618,15 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
 
 		if (req->file->f_op->read_iter)
-			ret2 = call_read_iter(req->file, kiocb, &iter);
+			ret2 = call_read_iter(req->file, kiocb, iter);
 		else if (req->file->f_op->read)
-			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+			ret2 = loop_rw_iter(READ, req->file, kiocb, iter);
 		else
 			ret2 = -EINVAL;
 
@@ -2625,8 +2635,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 			kiocb_done(kiocb, ret2);
 		} else {
 copy_iov:
-			ret = io_setup_async_rw(req, io_size, iovec,
-						inline_vecs, &iter);
+			ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
 			if (ret)
 				goto out_free;
 			/* any defer here is final, must blocking retry */
@@ -2645,8 +2654,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			 bool force_nonblock)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
+	struct io_async_rw *iorw = &req->io->rw;
 	ssize_t ret;
 
 	ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2662,15 +2670,16 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
+	iorw = &req->io->rw;
+	iorw->iter.iov = iorw->fast_iov;
 	req->io = NULL;
-	ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
+	ret = io_import_iovec(WRITE, req, (struct iovec **) &iorw->iter.iov,
+				&iorw->iter, !force_nonblock);
+	req->io = container_of(iorw, struct io_async_ctx, rw);
 	if (ret < 0)
 		return ret;
 
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
 	return 0;
 }
 
@@ -2678,11 +2687,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	struct iov_iter iter;
+	struct iov_iter __iter, *iter = &__iter;
 	size_t iov_count;
 	ssize_t ret, io_size;
 
-	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
+	if (req->io)
+		iter = &req->io->rw.iter;
+
+	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
 
@@ -2707,7 +2719,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 	    (req->flags & REQ_F_ISREG))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(&iter);
+	iov_count = iov_iter_count(iter);
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
@@ -2731,9 +2743,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
 
 		if (req->file->f_op->write_iter)
-			ret2 = call_write_iter(req->file, kiocb, &iter);
+			ret2 = call_write_iter(req->file, kiocb, iter);
 		else if (req->file->f_op->write)
-			ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+			ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
 		else
 			ret2 = -EINVAL;
 
@@ -2750,8 +2762,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 			kiocb_done(kiocb, ret2);
 		} else {
 copy_iov:
-			ret = io_setup_async_rw(req, io_size, iovec,
-						inline_vecs, &iter);
+			ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
 			if (ret)
 				goto out_free;
 			/* any defer here is final, must blocking retry */
@@ -5276,8 +5287,8 @@ static void io_cleanup_req(struct io_kiocb *req)
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
-		if (io->rw.iov != io->rw.fast_iov)
-			kfree(io->rw.iov);
+		if (io->rw.free_iovec)
+			kfree(io->rw.free_iovec);
 		break;
 	case IORING_OP_RECVMSG:
 		if (req->flags & REQ_F_BUFFER_SELECTED)
-- 
2.28.0



* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18  4:12     ` Jens Axboe
@ 2020-08-18  4:30       ` Jens Axboe
  2020-08-18  7:40         ` Stefan Metzmacher
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18  4:30 UTC (permalink / raw)
  To: Stefan Metzmacher, io-uring; +Cc: david, jmoyer

On 8/17/20 9:12 PM, Jens Axboe wrote:
> On 8/17/20 8:29 PM, Jens Axboe wrote:
>> On 8/17/20 2:25 AM, Stefan Metzmacher wrote:
>>> Hi Jens,
>>>
>>>> Since we've had a few cases of applications not dealing with this
>>>> appropriately, I believe the safest course of action is to ensure that
>>>> we don't return short reads when we really don't have to.
>>>>
>>>> The first patch is just a prep patch that retains iov_iter state over
>>>> retries, while the second one actually enables just doing retries if
>>>> we get a short read back.
>>>>
>>>> This passes all my testing, both the liburing regression tests and
>>>> tests that explicitly trigger internal short reads and hence retry
>>>> based on current state. No short reads are passed back to the
>>>> application.
>>>
>>> Thanks! I was going to ask about exactly that :-)
>>>
>>> It wasn't clear why returning short reads was justified by resulting
>>> in better performance... as it means the application needs to do
>>> a lot more work and syscalls.
>>
>> It mostly boils down to figuring out a good way to do it. With the
>> task_work based retry and the async buffered reads, we're almost there, and
>> the prep patch adds the last remaining bits to retain the iov_iter state
>> across issues.
>>
>>> Will this be backported?
>>
>> I can, but not really in an efficient manner. It depends on the async
>> buffered work to make progress, and the task_work handling retry. The
>> latter means it's 5.7+, while the former is only in 5.9+...
>>
>> We can make it work for earlier kernels by just using the thread offload
>> for that, and that may be worth doing. That would enable it in
>> 5.7-stable and 5.8-stable. For that, you just need these two patches.
>> Patch 1 would work as-is, while patch 2 would need a small bit of
>> massaging since io_read() doesn't have the retry parts.
>>
>> I'll give it a whirl just out of curiosity, then we can debate it after
>> that.
> 
> Here are the two patches against latest 5.7-stable (the rc branch, as
> we had quite a few queued up after 5.9-rc1). Totally untested, just
> wanted to see if it was doable.
> 
> First patch is mostly just applied, with various bits removed that we
> don't have in 5.7. The second patch just does -EAGAIN punt for the
> short read case, which will queue the remainder with io-wq for
> async execution.
> 
> Obviously needs quite a bit of testing before it can go anywhere else,
> but wanted to throw this out there in case you were interested in
> giving it a go...

Actually passes basic testing, and doesn't return short reads. So at
least it's not half bad, and it should be safe for you to test.

I quickly looked at 5.8 as well, and the good news is that the same
patches will apply there without changes.

-- 
Jens Axboe



* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18  4:30       ` Jens Axboe
@ 2020-08-18  7:40         ` Stefan Metzmacher
  2020-08-18 14:44           ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Stefan Metzmacher @ 2020-08-18  7:40 UTC (permalink / raw)
  To: Jens Axboe, io-uring, Anoop C S; +Cc: david, jmoyer



Hi Jens,

>>>> Will this be backported?
>>>
>>> I can, but not really in an efficient manner. It depends on the async
>>> buffered work to make progress, and the task_work handling retry. The
>>> latter means it's 5.7+, while the former is only in 5.9+...
>>>
>>> We can make it work for earlier kernels by just using the thread offload
>>> for that, and that may be worth doing. That would enable it in
>>> 5.7-stable and 5.8-stable. For that, you just need these two patches.
>>> Patch 1 would work as-is, while patch 2 would need a small bit of
>>> massaging since io_read() doesn't have the retry parts.
>>>
>>> I'll give it a whirl just out of curiosity, then we can debate it after
>>> that.
>>
>> Here are the two patches against latest 5.7-stable (the rc branch, as
>> we had quite a few queued up after 5.9-rc1). Totally untested, just
>> wanted to see if it was doable.
>>
>> First patch is mostly just applied, with various bits removed that we
>> don't have in 5.7. The second patch just does -EAGAIN punt for the
>> short read case, which will queue the remainder with io-wq for
>> async execution.
>>
>> Obviously needs quite a bit of testing before it can go anywhere else,
>> but wanted to throw this out there in case you were interested in
>> giving it a go...
> 
> Actually passes basic testing, and doesn't return short reads. So at
> least it's not half bad, and it should be safe for you to test.
> 
> I quickly looked at 5.8 as well, and the good news is that the same
> patches will apply there without changes.

Thanks, but I was just curious and I currently don't have the environment to test, sorry.

Anoop: you helped a lot reproducing the problem with 5.6; would you be able to
test the kernel patches against 5.7 or 5.8, while reverting the samba patches?
See https://lore.kernel.org/io-uring/[email protected]/T/#t for the
whole discussion?

metze





* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18  7:40         ` Stefan Metzmacher
@ 2020-08-18 14:44           ` Jens Axboe
  2020-08-18 14:49             ` Anoop C S
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18 14:44 UTC (permalink / raw)
  To: Stefan Metzmacher, io-uring, Anoop C S; +Cc: david, jmoyer

On 8/18/20 12:40 AM, Stefan Metzmacher wrote:
> Hi Jens,
> 
>>>>> Will this be backported?
>>>>
>>>> I can, but not really in an efficient manner. It depends on the async
>>>> buffered work to make progress, and the task_work handling retry. The
>>>> latter means it's 5.7+, while the former is only in 5.9+...
>>>>
>>>> We can make it work for earlier kernels by just using the thread offload
>>>> for that, and that may be worth doing. That would enable it in
>>>> 5.7-stable and 5.8-stable. For that, you just need these two patches.
>>>> Patch 1 would work as-is, while patch 2 would need a small bit of
>>>> massaging since io_read() doesn't have the retry parts.
>>>>
>>>> I'll give it a whirl just out of curiosity, then we can debate it after
>>>> that.
>>>
>>> Here are the two patches against latest 5.7-stable (the rc branch, as
>>> we had quite a few queued up after 5.9-rc1). Totally untested, just
>>> wanted to see if it was doable.
>>>
>>> First patch is mostly just applied, with various bits removed that we
>>> don't have in 5.7. The second patch just does -EAGAIN punt for the
>>> short read case, which will queue the remainder with io-wq for
>>> async execution.
>>>
>>> Obviously needs quite a bit of testing before it can go anywhere else,
>>> but wanted to throw this out there in case you were interested in
>>> giving it a go...
>>
>> Actually passes basic testing, and doesn't return short reads. So at
>> least it's not half bad, and it should be safe for you to test.
>>
>> I quickly looked at 5.8 as well, and the good news is that the same
>> patches will apply there without changes.
> 
> Thanks, but I was just curios and I currently don't have the environment to test, sorry.
> 
> Anoop: you helped a lot reproducing the problem with 5.6, would you be able to
> test the kernel patches against 5.7 or 5.8, while reverting the samba patches?
> See https://lore.kernel.org/io-uring/[email protected]/T/#t for the
> whole discussion?

I'm actually not too worried about the short read handling itself; that
will naturally fall out correctly if the rest of the path is sane. The
latter is what I'd be worried about! I ran some synthetic testing and
haven't seen any issues so far, so maybe (just maybe) it's actually good.
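
For the archives, the punt itself boils down to roughly the below in the
read path (hand-waved sketch, not the literal patch; ret2, io_size,
iovec/inline_vecs and the kiocb all come from the surrounding io_read()
context):

	/* Sketch: the nonblocking attempt returned fewer bytes than asked
	 * for. Instead of completing with a short read, save the iter
	 * state, skip past what we already copied, and return -EAGAIN so
	 * io-wq retries the remainder from a blocking context. */
	ret2 = call_read_iter(req->file, kiocb, iter);
	if (ret2 > 0 && ret2 < io_size) {
		iov_iter_advance(iter, ret2);
		if (!io_setup_async_rw(req, iovec, inline_vecs, iter))
			return -EAGAIN;
	}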

I can set up two branches with 5.7-stable + patches and 5.8-stable +
patches, if that helps with testing?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18 14:44           ` Jens Axboe
@ 2020-08-18 14:49             ` Anoop C S
  2020-08-18 14:53               ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Anoop C S @ 2020-08-18 14:49 UTC (permalink / raw)
  To: Jens Axboe, Stefan Metzmacher, io-uring; +Cc: david, jmoyer

On Tue, 2020-08-18 at 07:44 -0700, Jens Axboe wrote:
> On 8/18/20 12:40 AM, Stefan Metzmacher wrote:
> > Hi Jens,
> > 
> > > > > > Will this be backported?
> > > > > 
> > > > > I can, but not really in an efficient manner. It depends on
> > > > > the async
> > > > > buffered work to make progress, and the task_work handling
> > > > > retry. The
> > > > > latter means it's 5.7+, while the former is only in 5.9+...
> > > > > 
> > > > > We can make it work for earlier kernels by just using the
> > > > > thread offload
> > > > > for that, and that may be worth doing. That would enable it
> > > > > in
> > > > > 5.7-stable and 5.8-stable. For that, you just need these two
> > > > > patches.
> > > > > Patch 1 would work as-is, while patch 2 would need a small
> > > > > bit of
> > > > > massaging since io_read() doesn't have the retry parts.
> > > > > 
> > > > > I'll give it a whirl just out of curiosity, then we can
> > > > > debate it after
> > > > > that.
> > > > 
> > > > Here are the two patches against latest 5.7-stable (the rc
> > > > branch, as
> > > > we had quite a few queued up after 5.9-rc1). Totally untested,
> > > > just
> > > > wanted to see if it was doable.
> > > > 
> > > > First patch is mostly just applied, with various bits removed
> > > > that we
> > > > don't have in 5.7. The second patch just does -EAGAIN punt for
> > > > the
> > > > short read case, which will queue the remainder with io-wq for
> > > > async execution.
> > > > 
> > > > Obviously needs quite a bit of testing before it can go
> > > > anywhere else,
> > > > but wanted to throw this out there in case you were interested
> > > > in
> > > > giving it a go...
> > > 
> > > Actually passes basic testing, and doesn't return short reads. So
> > > at
> > > least it's not half bad, and it should be safe for you to test.
> > > 
> > > I quickly looked at 5.8 as well, and the good news is that the
> > > same
> > > patches will apply there without changes.
> > 
> > Thanks, but I was just curios and I currently don't have the
> > environment to test, sorry.
> > 
> > Anoop: you helped a lot reproducing the problem with 5.6, would you
> > be able to
> > test the kernel patches against 5.7 or 5.8, while reverting the
> > samba patches?
> > See 
> > https://lore.kernel.org/io-uring/[email protected]/T/#t
> > for the
> > whole discussion?
> 
> I'm actually not too worried about the short reads not working, it'll
> naturally fall out correctly if the rest of the path is sane. The
> latter
> is what I'd be worried about! I ran some synthetic testing and
> haven't
> seen any issues so far, so maybe (just maybe) it's actually good.
> 
> I can setup two branches with the 5.7-stable + patches and 5.8-stable 
> +
> patches if that helps facilitate testing?

That would be great.

I took those two patches and tried to apply them on top of 5.7.y. I had
to manually resolve a few conflicts, so I am not sure whether it is OK
to test such a patched version (because of the conflicts).

Thanks,
Anoop C S.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18 14:49             ` Anoop C S
@ 2020-08-18 14:53               ` Jens Axboe
  2020-08-18 15:23                 ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18 14:53 UTC (permalink / raw)
  To: Anoop C S, Stefan Metzmacher, io-uring; +Cc: david, jmoyer

On 8/18/20 7:49 AM, Anoop C S wrote:
> On Tue, 2020-08-18 at 07:44 -0700, Jens Axboe wrote:
>> On 8/18/20 12:40 AM, Stefan Metzmacher wrote:
>>> Hi Jens,
>>>
>>>>>>> Will this be backported?
>>>>>>
>>>>>> I can, but not really in an efficient manner. It depends on
>>>>>> the async
>>>>>> buffered work to make progress, and the task_work handling
>>>>>> retry. The
>>>>>> latter means it's 5.7+, while the former is only in 5.9+...
>>>>>>
>>>>>> We can make it work for earlier kernels by just using the
>>>>>> thread offload
>>>>>> for that, and that may be worth doing. That would enable it
>>>>>> in
>>>>>> 5.7-stable and 5.8-stable. For that, you just need these two
>>>>>> patches.
>>>>>> Patch 1 would work as-is, while patch 2 would need a small
>>>>>> bit of
>>>>>> massaging since io_read() doesn't have the retry parts.
>>>>>>
>>>>>> I'll give it a whirl just out of curiosity, then we can
>>>>>> debate it after
>>>>>> that.
>>>>>
>>>>> Here are the two patches against latest 5.7-stable (the rc
>>>>> branch, as
>>>>> we had quite a few queued up after 5.9-rc1). Totally untested,
>>>>> just
>>>>> wanted to see if it was doable.
>>>>>
>>>>> First patch is mostly just applied, with various bits removed
>>>>> that we
>>>>> don't have in 5.7. The second patch just does -EAGAIN punt for
>>>>> the
>>>>> short read case, which will queue the remainder with io-wq for
>>>>> async execution.
>>>>>
>>>>> Obviously needs quite a bit of testing before it can go
>>>>> anywhere else,
>>>>> but wanted to throw this out there in case you were interested
>>>>> in
>>>>> giving it a go...
>>>>
>>>> Actually passes basic testing, and doesn't return short reads. So
>>>> at
>>>> least it's not half bad, and it should be safe for you to test.
>>>>
>>>> I quickly looked at 5.8 as well, and the good news is that the
>>>> same
>>>> patches will apply there without changes.
>>>
>>> Thanks, but I was just curios and I currently don't have the
>>> environment to test, sorry.
>>>
>>> Anoop: you helped a lot reproducing the problem with 5.6, would you
>>> be able to
>>> test the kernel patches against 5.7 or 5.8, while reverting the
>>> samba patches?
>>> See 
>>> https://lore.kernel.org/io-uring/[email protected]/T/#t
>>> for the
>>> whole discussion?
>>
>> I'm actually not too worried about the short reads not working, it'll
>> naturally fall out correctly if the rest of the path is sane. The
>> latter
>> is what I'd be worried about! I ran some synthetic testing and
>> haven't
>> seen any issues so far, so maybe (just maybe) it's actually good.
>>
>> I can setup two branches with the 5.7-stable + patches and 5.8-stable 
>> +
>> patches if that helps facilitate testing?
> 
> That would be great.
> 
> I took those two patches and tried to apply on top of 5.7.y. I had to
> manually resolve very few conflicts. I am not sure whether it is OK or
> not to test such a patched version(because of conflicts).

I pushed out two branches:

5.8-stable: current 5.8-stable rc queue + the three patches for this
5.7-stable: 5.7 ditto

So pick which one you want to use, and then pull it.

git://git.kernel.dk/linux-block 5.8-stable

git://git.kernel.dk/linux-block 5.7-stable
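
In case it saves anyone a step, pulling one of them into an existing
tree is just something like this (the remote name here is arbitrary):

git remote add axboe-block git://git.kernel.dk/linux-block
git fetch axboe-block
git checkout -b 5.7-stable axboe-block/5.7-stable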

Hope that helps!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-18 14:53               ` Jens Axboe
@ 2020-08-18 15:23                 ` Jens Axboe
       [not found]                   ` <[email protected]>
  0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2020-08-18 15:23 UTC (permalink / raw)
  To: Anoop C S, Stefan Metzmacher, io-uring; +Cc: david, jmoyer

On 8/18/20 7:53 AM, Jens Axboe wrote:
> On 8/18/20 7:49 AM, Anoop C S wrote:
>> On Tue, 2020-08-18 at 07:44 -0700, Jens Axboe wrote:
>>> On 8/18/20 12:40 AM, Stefan Metzmacher wrote:
>>>> Hi Jens,
>>>>
>>>>>>>> Will this be backported?
>>>>>>>
>>>>>>> I can, but not really in an efficient manner. It depends on
>>>>>>> the async
>>>>>>> buffered work to make progress, and the task_work handling
>>>>>>> retry. The
>>>>>>> latter means it's 5.7+, while the former is only in 5.9+...
>>>>>>>
>>>>>>> We can make it work for earlier kernels by just using the
>>>>>>> thread offload
>>>>>>> for that, and that may be worth doing. That would enable it
>>>>>>> in
>>>>>>> 5.7-stable and 5.8-stable. For that, you just need these two
>>>>>>> patches.
>>>>>>> Patch 1 would work as-is, while patch 2 would need a small
>>>>>>> bit of
>>>>>>> massaging since io_read() doesn't have the retry parts.
>>>>>>>
>>>>>>> I'll give it a whirl just out of curiosity, then we can
>>>>>>> debate it after
>>>>>>> that.
>>>>>>
>>>>>> Here are the two patches against latest 5.7-stable (the rc
>>>>>> branch, as
>>>>>> we had quite a few queued up after 5.9-rc1). Totally untested,
>>>>>> just
>>>>>> wanted to see if it was doable.
>>>>>>
>>>>>> First patch is mostly just applied, with various bits removed
>>>>>> that we
>>>>>> don't have in 5.7. The second patch just does -EAGAIN punt for
>>>>>> the
>>>>>> short read case, which will queue the remainder with io-wq for
>>>>>> async execution.
>>>>>>
>>>>>> Obviously needs quite a bit of testing before it can go
>>>>>> anywhere else,
>>>>>> but wanted to throw this out there in case you were interested
>>>>>> in
>>>>>> giving it a go...
>>>>>
>>>>> Actually passes basic testing, and doesn't return short reads. So
>>>>> at
>>>>> least it's not half bad, and it should be safe for you to test.
>>>>>
>>>>> I quickly looked at 5.8 as well, and the good news is that the
>>>>> same
>>>>> patches will apply there without changes.
>>>>
>>>> Thanks, but I was just curios and I currently don't have the
>>>> environment to test, sorry.
>>>>
>>>> Anoop: you helped a lot reproducing the problem with 5.6, would you
>>>> be able to
>>>> test the kernel patches against 5.7 or 5.8, while reverting the
>>>> samba patches?
>>>> See 
>>>> https://lore.kernel.org/io-uring/[email protected]/T/#t
>>>> for the
>>>> whole discussion?
>>>
>>> I'm actually not too worried about the short reads not working, it'll
>>> naturally fall out correctly if the rest of the path is sane. The
>>> latter
>>> is what I'd be worried about! I ran some synthetic testing and
>>> haven't
>>> seen any issues so far, so maybe (just maybe) it's actually good.
>>>
>>> I can setup two branches with the 5.7-stable + patches and 5.8-stable 
>>> +
>>> patches if that helps facilitate testing?
>>
>> That would be great.
>>
>> I took those two patches and tried to apply on top of 5.7.y. I had to
>> manually resolve very few conflicts. I am not sure whether it is OK or
>> not to test such a patched version(because of conflicts).
> 
> I pushed out two branches:
> 
> 5.8-stable: current 5.8-stable rc queue + the three patches for this
> 5.7-stable: 5.7 ditto
> 
> So pick which one you want to use, and then pull it.
> 
> git://git.kernel.dk/linux-block 5.8-stable
> 
> git://git.kernel.dk/linux-block 5.7-stable
> 
> Hope that helps!

Ran these through the liburing regression tests as well, and found a
case where 'ret2' wasn't initialized. So I pushed out new branches; the
correct SHAs for testing are:

5.7-stable: a451911d530075352fbc7ef9bb2df68145a747ad
5.8-stable: b101e651782a60eb1e96b64e523e51358b77f94f
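
(If it helps, a quick way to confirm you're on the right one after
pulling is to check that 'git rev-parse HEAD' prints the matching sha.)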

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
       [not found]                   ` <[email protected]>
@ 2020-08-19  8:31                     ` Stefan Metzmacher
  2020-08-19 12:48                       ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Stefan Metzmacher @ 2020-08-19  8:31 UTC (permalink / raw)
  To: Anoop C S, Jens Axboe; +Cc: david, jmoyer, io-uring


[-- Attachment #1.1: Type: text/plain, Size: 1139 bytes --]

Hi Anoop,

> @metze,
> 
> Looks like issue is fixed with patched version of v5.7.15. For the
> record I am noting down the steps followed:
> 
> Following is my configuration:
> OS: Fedora 32
> Kernel: 5.7.15 [5.7.15-200]
>         5.7.15+patches from 5.7-stable branch [5.7.15-202]
> Samba: 4.12.6 [4.12.6-0]
>        4.12.6+git reverts for vfs_io_uring [4.12.6-1]
> liburing: 0.7
> Client: Win 10, build 1909
> Issue: SHA256 mismatch on files copied from io_uring enabled Samba
>        share using Windows explorer
> 
> * Issue not seen with unpatched 5.7.15 and samba-4.12.6
> 
> * prepared samba-4.12.6 build[1] with reverts(see attachment) for
> vfs_io_uring(as per https://bugzilla.samba.org/show_bug.cgi?id=14361)
> and reproduced issue with unpatched 5.7.15.
> 
> * prepared kernel-5.7.15 build[2] with extra commits(see attachment) as
> per 5.7-stable branch from git://git.kernel.dk/linux-block. SHA256
> mismatch is no longer seen.
> 
> [1] https://koji.fedoraproject.org/koji/taskinfo?taskID=49539069
> [2] https://koji.fedoraproject.org/koji/taskinfo?taskID=49585679

Great, thanks!

metze


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCHSET v2 0/2] io_uring: handle short reads internally
  2020-08-19  8:31                     ` Stefan Metzmacher
@ 2020-08-19 12:48                       ` Jens Axboe
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-19 12:48 UTC (permalink / raw)
  To: Stefan Metzmacher, Anoop C S; +Cc: david, jmoyer, io-uring

On 8/19/20 1:31 AM, Stefan Metzmacher wrote:
> Hi Anoop,
> 
>> @metze,
>>
>> Looks like issue is fixed with patched version of v5.7.15. For the
>> record I am noting down the steps followed:
>>
>> Following is my configuration:
>> OS: Fedora 32
>> Kernel: 5.7.15 [5.7.15-200]
>>         5.7.15+patches from 5.7-stable branch [5.7.15-202]
>> Samba: 4.12.6 [4.12.6-0]
>>        4.12.6+git reverts for vfs_io_uring [4.12.6-1]
>> liburing: 0.7
>> Client: Win 10, build 1909
>> Issue: SHA256 mismatch on files copied from io_uring enabled Samba
>>        share using Windows explorer
>>
>> * Issue not seen with unpatched 5.7.15 and samba-4.12.6
>>
>> * prepared samba-4.12.6 build[1] with reverts(see attachment) for
>> vfs_io_uring(as per https://bugzilla.samba.org/show_bug.cgi?id=14361)
>> and reproduced issue with unpatched 5.7.15.
>>
>> * prepared kernel-5.7.15 build[2] with extra commits(see attachment) as
>> per 5.7-stable branch from git://git.kernel.dk/linux-block. SHA256
>> mismatch is no longer seen.
>>
>> [1] https://koji.fedoraproject.org/koji/taskinfo?taskID=49539069
>> [2] https://koji.fedoraproject.org/koji/taskinfo?taskID=49585679
> 
> Great, thanks!

Indeed, thanks a lot for retesting! It helps validate that the backport
is sane in general, both from the "now handles short reads and retries
them without issue" side and from the "didn't mess anything else up
majorly" side of things :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2020-08-19 12:49 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-14 19:54 [PATCHSET v2 0/2] io_uring: handle short reads internally Jens Axboe
2020-08-14 19:54 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe
2020-08-14 19:54 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe
2020-08-17  9:25 ` [PATCHSET v2 0/2] io_uring: handle short reads internally Stefan Metzmacher
2020-08-18  3:29   ` Jens Axboe
2020-08-18  4:12     ` Jens Axboe
2020-08-18  4:30       ` Jens Axboe
2020-08-18  7:40         ` Stefan Metzmacher
2020-08-18 14:44           ` Jens Axboe
2020-08-18 14:49             ` Anoop C S
2020-08-18 14:53               ` Jens Axboe
2020-08-18 15:23                 ` Jens Axboe
     [not found]                   ` <[email protected]>
2020-08-19  8:31                     ` Stefan Metzmacher
2020-08-19 12:48                       ` Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2020-08-13 17:56 [PATCHSET " Jens Axboe
2020-08-13 17:56 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox