public inbox for [email protected]
* [PATCHSET 0/2] Ensure all needed read/write data is stable
@ 2019-12-02 22:30 Jens Axboe
  2019-12-02 22:31 ` [PATCH 1/2] io_uring: add general async offload context Jens Axboe
  2019-12-02 22:31 ` [PATCH 2/2] io_uring: ensure async punted read/write requests copy iovec Jens Axboe
  0 siblings, 2 replies; 3+ messages in thread
From: Jens Axboe @ 2019-12-02 22:30 UTC (permalink / raw)
  To: io-uring; +Cc: carter.li

Currently we don't copy the associated iovecs when we punt a request to
async context, and this can prove problematic if the caller only keeps
the iovec valid for the duration of the submission call. That is a
perfectly reasonable assumption for the caller to make, and I think
io_uring should make sure it is safe to rely on.
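
As an example of the usage pattern that should be safe, here is a
minimal liburing sketch (the helper and its error handling are
illustrative, not taken from the report): the iovec lives on the
submitter's stack and goes out of scope as soon as io_uring_submit()
returns.

#include <liburing.h>

static int queue_read(struct io_uring *ring, int fd, void *buf, size_t len)
{
	/* the iovec only lives until this function returns */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
	/* after this returns, the kernel must not look at 'iov' again */
	return io_uring_submit(ring);
}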

The first patch is a prep patch, and the second adds the necessary
support for IORING_OP_READV and IORING_OP_WRITEV to handle this
appropriately. We really need to do the same for the other calls that
pass in pointers to structs, like sendmsg/recvmsg, connect, and accept.
I will deal with those next.

Also see: https://github.com/axboe/liburing/issues/27

-- 
Jens Axboe




* [PATCH 1/2] io_uring: add general async offload context
  2019-12-02 22:30 [PATCHSET 0/2] Ensure all needed read/write data is stable Jens Axboe
@ 2019-12-02 22:31 ` Jens Axboe
  2019-12-02 22:31 ` [PATCH 2/2] io_uring: ensure async punted read/write requests copy iovec Jens Axboe
  1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2019-12-02 22:31 UTC (permalink / raw)
  To: io-uring; +Cc: carter.li, Jens Axboe

Right now we just copy the sqe for async offload, but we want to store
more context across an async punt. In preparation for doing so, put the
sqe copy inside a structure that we can expand. With this pointer added,
we can get rid of REQ_F_FREE_SQE, as that is now indicated by whether
req->io is NULL or not.

No functional changes in this patch.
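
Condensed, the rule this introduces looks roughly like the following
(a sketch only; free_req_async_state() is made up for illustration,
the real cleanup lives in __io_free_req() in the diff below):

struct io_async_ctx {
	struct io_uring_sqe	sqe;	/* stable copy of the sqe */
};

/*
 * req->io != NULL now means "async offload state was allocated",
 * which is what the old REQ_F_FREE_SQE flag used to indicate.
 */
static void free_req_async_state(struct io_kiocb *req)
{
	kfree(req->io);		/* kfree(NULL) is a no-op */
	req->io = NULL;
}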

Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 56 +++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5cab7a047317..0328f565c0c9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,6 +308,10 @@ struct io_timeout {
 	struct io_timeout_data		*data;
 };
 
+struct io_async_ctx {
+	struct io_uring_sqe		sqe;
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -323,6 +327,7 @@ struct io_kiocb {
 	};
 
 	const struct io_uring_sqe	*sqe;
+	struct io_async_ctx		*io;
 	struct file			*ring_file;
 	int				ring_fd;
 	bool				has_user;
@@ -353,7 +358,6 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
 #define REQ_F_INFLIGHT		16384	/* on inflight list */
 #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
-#define REQ_F_FREE_SQE		65536	/* free sqe if not async queued */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -806,6 +810,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	}
 
 got_it:
+	req->io = NULL;
 	req->ring_file = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
@@ -836,8 +841,8 @@ static void __io_free_req(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (req->flags & REQ_F_FREE_SQE)
-		kfree(req->sqe);
+	if (req->io)
+		kfree(req->io);
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	if (req->flags & REQ_F_INFLIGHT) {
@@ -1079,9 +1084,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			 * completions for those, only batch free for fixed
 			 * file and non-linked commands.
 			 */
-			if (((req->flags &
-				(REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
-			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
+			if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
+			    !req->io) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -2257,7 +2262,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!poll->wait)
 		return -ENOMEM;
 
-	req->sqe = NULL;
+	req->io = NULL;
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -2600,27 +2605,27 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_req_defer(struct io_kiocb *req)
 {
-	struct io_uring_sqe *sqe_copy;
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_async_ctx *io;
 
 	/* Still need defer if there is pending req in defer list. */
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list))
 		return 0;
 
-	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
-	if (!sqe_copy)
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
 		return -EAGAIN;
 
 	spin_lock_irq(&ctx->completion_lock);
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
-		kfree(sqe_copy);
+		kfree(io);
 		return 0;
 	}
 
-	memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
-	req->flags |= REQ_F_FREE_SQE;
-	req->sqe = sqe_copy;
+	memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+	req->sqe = &io->sqe;
+	req->io = io;
 
 	trace_io_uring_defer(ctx, req, req->user_data);
 	list_add_tail(&req->list, &ctx->defer_list);
@@ -2953,14 +2958,16 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	 */
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
-		struct io_uring_sqe *sqe_copy;
+		struct io_async_ctx *io;
 
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy)
+		io = kmalloc(sizeof(*io), GFP_KERNEL);
+		if (!io)
 			goto err;
 
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
+		memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+
+		req->sqe = &io->sqe;
+		req->io = io;
 
 		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
 			ret = io_grab_files(req);
@@ -3061,7 +3068,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 	 */
 	if (*link) {
 		struct io_kiocb *prev = *link;
-		struct io_uring_sqe *sqe_copy;
+		struct io_async_ctx *io;
 
 		if (req->sqe->flags & IOSQE_IO_DRAIN)
 			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
@@ -3077,14 +3084,15 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			}
 		}
 
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy) {
+		io = kmalloc(sizeof(*io), GFP_KERNEL);
+		if (!io) {
 			ret = -EAGAIN;
 			goto err_req;
 		}
 
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
+		memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+		req->sqe = &io->sqe;
+		req->io = io;
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (req->sqe->flags & IOSQE_IO_LINK) {
-- 
2.24.0



* [PATCH 2/2] io_uring: ensure async punted read/write requests copy iovec
  2019-12-02 22:30 [PATCHSET 0/2] Ensure all needed read/write data is stable Jens Axboe
  2019-12-02 22:31 ` [PATCH 1/2] io_uring: add general async offload context Jens Axboe
@ 2019-12-02 22:31 ` Jens Axboe
  1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2019-12-02 22:31 UTC (permalink / raw)
  To: io-uring; +Cc: carter.li, Jens Axboe

Currently we don't copy the iovecs when we punt to async context. This
can be problematic for applications that store the iovec on the stack,
as they often assume it's safe to let the iovec go out of scope as soon
as IO submission returns. That isn't safe today, because we re-import
the iovec from user memory once we're in async context, by which point
it may no longer be valid.

Make this 100% safe by copying the already-imported iovec before the
request is punted, and having the async side work from that stable
copy. With this change, applications may safely keep the iovec on the
stack in all cases.
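
The gist of the change, condensed into an illustrative sketch
(punt_to_async() is made up here; the real code is
io_setup_async_io()/io_req_map_io() plus the req->io branch in
io_import_iovec() in the diff below):

/* on a forced-nonblock -EAGAIN: stash the already-imported iovec */
static int punt_to_async(struct io_kiocb *req, ssize_t io_size,
			 struct iovec *iovec, struct iovec *fast_iov,
			 struct iov_iter *iter)
{
	req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
	if (!req->io)
		return -ENOMEM;
	/* copies the segments, so the caller's iovec may go out of scope */
	io_req_map_io(req, io_size, iovec, fast_iov, iter);
	return -EAGAIN;		/* the caller then punts the request to io-wq */
}

/*
 * Once in async context, io_import_iovec() reads the saved copy from
 * req->io->rw instead of dereferencing user memory again.
 */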

Reported-by: 李通洲 <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
 fs/io_uring.c | 241 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 179 insertions(+), 62 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0328f565c0c9..ccc98042b28d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,8 +308,18 @@ struct io_timeout {
 	struct io_timeout_data		*data;
 };
 
+struct io_async_rw {
+	struct iovec			fast_iov[UIO_FASTIOV];
+	struct iovec			*iov;
+	ssize_t				nr_segs;
+	ssize_t				size;
+};
+
 struct io_async_ctx {
 	struct io_uring_sqe		sqe;
+	union {
+		struct io_async_rw	rw;
+	};
 };
 
 /*
@@ -358,6 +368,7 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
 #define REQ_F_INFLIGHT		16384	/* on inflight list */
 #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
+#define REQ_F_PREPPED		65536	/* has done prep */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -1415,15 +1426,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
 	if (S_ISREG(file_inode(req->file)->i_mode))
 		req->flags |= REQ_F_ISREG;
 
-	/*
-	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
-	 * we know to async punt it even if it was opened O_NONBLOCK
-	 */
-	if (force_nonblock && !io_file_supports_async(req->file)) {
-		req->flags |= REQ_F_MUST_PUNT;
-		return -EAGAIN;
-	}
-
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -1592,6 +1594,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		return io_import_fixed(req->ctx, rw, sqe, iter);
 	}
 
+	if (req->io) {
+		struct io_async_rw *iorw = &req->io->rw;
+
+		*iovec = iorw->iov;
+		iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
+		if (iorw->iov == iorw->fast_iov)
+			*iovec = NULL;
+		return iorw->size;
+	}
+
 	if (!req->has_user)
 		return -EFAULT;
 
@@ -1662,6 +1674,53 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	return ret;
 }
 
+static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
+			  struct iovec *iovec, struct iovec *fast_iov,
+			  struct iov_iter *iter)
+{
+	req->io->rw.nr_segs = iter->nr_segs;
+	req->io->rw.size = io_size;
+	req->io->rw.iov = iovec;
+	if (!req->io->rw.iov) {
+		req->io->rw.iov = req->io->rw.fast_iov;
+		memcpy(req->io->rw.iov, fast_iov,
+			sizeof(struct iovec) * iter->nr_segs);
+	}
+
+	memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
+	req->sqe = &req->io->sqe;
+
+	req->flags |= REQ_F_PREPPED;
+}
+
+static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size,
+			     struct iovec *iovec, struct iovec *fast_iov,
+			     struct iov_iter *iter)
+{
+	req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+	if (req->io) {
+		io_req_map_io(req, io_size, iovec, fast_iov, iter);
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
+			struct iov_iter *iter, bool force_nonblock)
+{
+	ssize_t ret;
+
+	ret = io_prep_rw(req, force_nonblock);
+	if (ret)
+		return ret;
+
+	if (unlikely(!(req->file->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	return io_import_iovec(READ, req, iovec, iter);
+}
+
 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 		   bool force_nonblock)
 {
@@ -1670,23 +1729,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t read_size, ret;
-
-	ret = io_prep_rw(req, force_nonblock);
-	if (ret)
-		return ret;
-	file = kiocb->ki_filp;
-
-	if (unlikely(!(file->f_mode & FMODE_READ)))
-		return -EBADF;
+	ssize_t io_size, ret;
 
-	ret = io_import_iovec(READ, req, &iovec, &iter);
-	if (ret < 0)
-		return ret;
+	if (!(req->flags & REQ_F_PREPPED)) {
+		ret = io_read_prep(req, &iovec, &iter, force_nonblock);
+		if (ret < 0)
+			return ret;
+	} else {
+		ret = io_import_iovec(READ, req, &iovec, &iter);
+		if (ret < 0)
+			return ret;
+	}
 
-	read_size = ret;
+	file = req->file;
+	io_size = ret;
 	if (req->flags & REQ_F_LINK)
-		req->result = read_size;
+		req->result = io_size;
+
+	/*
+	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+	 * we know to async punt it even if it was opened O_NONBLOCK
+	 */
+	if (force_nonblock && !io_file_supports_async(file)) {
+		req->flags |= REQ_F_MUST_PUNT;
+		goto copy_iov;
+	}
 
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -1708,18 +1775,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 		 */
 		if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
 		    (req->flags & REQ_F_ISREG) &&
-		    ret2 > 0 && ret2 < read_size)
+		    ret2 > 0 && ret2 < io_size)
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || ret2 != -EAGAIN)
+		if (!force_nonblock || ret2 != -EAGAIN) {
 			kiocb_done(kiocb, ret2, nxt, req->in_async);
-		else
-			ret = -EAGAIN;
+		} else {
+copy_iov:
+			ret = io_setup_async_io(req, io_size, iovec,
+						inline_vecs, &iter);
+			if (ret)
+				goto out_free;
+			return -EAGAIN;
+		}
 	}
+out_free:
 	kfree(iovec);
 	return ret;
 }
 
+static int io_write_prep(struct io_kiocb *req, struct iovec **iovec,
+			 struct iov_iter *iter, bool force_nonblock)
+{
+	ssize_t ret;
+
+	ret = io_prep_rw(req, force_nonblock);
+	if (ret)
+		return ret;
+
+	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	return io_import_iovec(WRITE, req, iovec, iter);
+}
+
 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 		    bool force_nonblock)
 {
@@ -1728,29 +1817,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	ssize_t ret, io_size;
 
-	ret = io_prep_rw(req, force_nonblock);
-	if (ret)
-		return ret;
+	if (!(req->flags & REQ_F_PREPPED)) {
+		ret = io_write_prep(req, &iovec, &iter, force_nonblock);
+		if (ret < 0)
+			return ret;
+	} else {
+		ret = io_import_iovec(WRITE, req, &iovec, &iter);
+		if (ret < 0)
+			return ret;
+	}
 
 	file = kiocb->ki_filp;
-	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		return -EBADF;
-
-	ret = io_import_iovec(WRITE, req, &iovec, &iter);
-	if (ret < 0)
-		return ret;
-
+	io_size = ret;
 	if (req->flags & REQ_F_LINK)
-		req->result = ret;
+		req->result = io_size;
 
-	iov_count = iov_iter_count(&iter);
+	/*
+	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+	 * we know to async punt it even if it was opened O_NONBLOCK
+	 */
+	if (force_nonblock && !io_file_supports_async(req->file)) {
+		req->flags |= REQ_F_MUST_PUNT;
+		goto copy_iov;
+	}
 
-	ret = -EAGAIN;
 	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
-		goto out_free;
+		goto copy_iov;
 
+	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
@@ -1774,10 +1870,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 			ret2 = call_write_iter(file, kiocb, &iter);
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
-		if (!force_nonblock || ret2 != -EAGAIN)
+		if (!force_nonblock || ret2 != -EAGAIN) {
 			kiocb_done(kiocb, ret2, nxt, req->in_async);
-		else
-			ret = -EAGAIN;
+		} else {
+copy_iov:
+			ret = io_setup_async_io(req, io_size, iovec,
+						inline_vecs, &iter);
+			if (ret)
+				goto out_free;
+			return -EAGAIN;
+		}
 	}
 out_free:
 	kfree(iovec);
@@ -2603,10 +2705,36 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	switch (READ_ONCE(req->sqe->opcode)) {
+	case IORING_OP_READV:
+		ret = io_read_prep(req, &iovec, &iter, true);
+		break;
+	case IORING_OP_WRITEV:
+		ret = io_write_prep(req, &iovec, &iter, true);
+		break;
+	default:
+		return 0;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	req->io = io;
+	io_req_map_io(req, ret, iovec, inline_vecs, &iter);
+	return 0;
+}
+
 static int io_req_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_async_ctx *io;
+	int ret;
 
 	/* Still need defer if there is pending req in defer list. */
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list))
@@ -2623,9 +2751,9 @@ static int io_req_defer(struct io_kiocb *req)
 		return 0;
 	}
 
-	memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
-	req->sqe = &io->sqe;
-	req->io = io;
+	ret = io_req_defer_prep(req, io);
+	if (ret < 0)
+		return ret;
 
 	trace_io_uring_defer(ctx, req, req->user_data);
 	list_add_tail(&req->list, &ctx->defer_list);
@@ -2958,17 +3086,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	 */
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
-		struct io_async_ctx *io;
-
-		io = kmalloc(sizeof(*io), GFP_KERNEL);
-		if (!io)
-			goto err;
-
-		memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
-
-		req->sqe = &io->sqe;
-		req->io = io;
-
 		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -3090,9 +3207,9 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			goto err_req;
 		}
 
-		memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
-		req->sqe = &io->sqe;
-		req->io = io;
+		ret = io_req_defer_prep(req, io);
+		if (ret)
+			goto err_req;
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (req->sqe->flags & IOSQE_IO_LINK) {
-- 
2.24.0

