public inbox for [email protected]
 help / color / mirror / Atom feed
* [RFC 0/2] fix in-kernel segfault
@ 2019-11-23 22:49 Pavel Begunkov
  2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring

There is a bug hunging my system when run fixed-link with /dev/urandom
instead of /dev/zero (see patch 1/2).

As for me, the easiest way to fix is to grab mm and use userspace
address for this specific case (as it's done in patches). The other
way is to kmap/vmap, but the first should be short-lived and the
second needs mm anyway.

Ideas how to do it better way? Suggestions and corrections are welcome.

P.S. It seems for some reason it fails a bunch of unrelated tests,
that's POC and not meant to be applied yet.


Pavel Begunkov (2):
  io_uring: fix dead-hung for non-iter fixed rw
  io_uring: fix linked fixed !iter rw

 fs/io_uring.c | 95 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 35 deletions(-)

-- 
2.24.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw
  2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
@ 2019-11-23 22:49 ` Pavel Begunkov
  2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Read/write requests to devices without implemented read/write_iter
using fixed buffers causes general protection fault, which totally
hanged up a machine.

io_import_fixed() initialises iov_iter as a bvec, but loop_rw_iter()
tries to access its unitialised iov, so dereferencing random address.

Fix it by passing a userspace ptr instead in this case.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8119cbae4fb6..566e987c6dab 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -172,6 +172,11 @@ struct io_mapped_ubuf {
 	unsigned int	nr_bvecs;
 };
 
+struct io_ubuf_iter {
+	struct iov_iter	it;
+	u64		ubuf;
+};
+
 struct fixed_file_table {
 	struct file		**files;
 };
@@ -1486,7 +1491,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
 
 static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
 				const struct io_uring_sqe *sqe,
-				struct iov_iter *iter)
+				struct io_ubuf_iter *iter)
 {
 	size_t len = READ_ONCE(sqe->len);
 	struct io_mapped_ubuf *imu;
@@ -1513,12 +1518,14 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
 		return -EFAULT;
 
+	iter->ubuf = buf_addr;
+
 	/*
 	 * May not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
-	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+	iov_iter_bvec(&iter->it, rw, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
 		/*
@@ -1540,7 +1547,7 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset <= bvec->bv_len) {
-			iov_iter_advance(iter, offset);
+			iov_iter_advance(&iter->it, offset);
 		} else {
 			unsigned long seg_skip;
 
@@ -1548,25 +1555,27 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
 			offset -= bvec->bv_len;
 			seg_skip = 1 + (offset >> PAGE_SHIFT);
 
-			iter->bvec = bvec + seg_skip;
-			iter->nr_segs -= seg_skip;
-			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~PAGE_MASK;
+			iter->it.bvec = bvec + seg_skip;
+			iter->it.nr_segs -= seg_skip;
+			iter->it.count -= bvec->bv_len + offset;
+			iter->it.iov_offset = offset & ~PAGE_MASK;
 		}
 	}
 
-	return iter->count;
+	return iter->it.count;
 }
 
 static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
 			       const struct sqe_submit *s, struct iovec **iovec,
-			       struct iov_iter *iter)
+			       struct io_ubuf_iter *iter)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	size_t sqe_len = READ_ONCE(sqe->len);
 	u8 opcode;
 
+	iter->ubuf = 0;
+
 	/*
 	 * We're reading ->opcode for the second time, but the first read
 	 * doesn't care whether it's _FIXED or not, so it doesn't matter
@@ -1587,10 +1596,10 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
 #ifdef CONFIG_COMPAT
 	if (ctx->compat)
 		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
-						iovec, iter);
+						iovec, &iter->it);
 #endif
 
-	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, &iter->it);
 }
 
 /*
@@ -1598,7 +1607,7 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
  * by looping over ->read() or ->write() manually.
  */
 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
-			   struct iov_iter *iter)
+			   struct io_ubuf_iter *iter)
 {
 	ssize_t ret = 0;
 
@@ -1612,10 +1621,17 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	if (kiocb->ki_flags & IOCB_NOWAIT)
 		return -EAGAIN;
 
-	while (iov_iter_count(iter)) {
-		struct iovec iovec = iov_iter_iovec(iter);
+	while (iov_iter_count(&iter->it)) {
+		struct iovec iovec;
 		ssize_t nr;
 
+		if (iter_is_iovec(&iter->it)) {
+			iovec = iov_iter_iovec(&iter->it);
+		} else {
+			iovec.iov_base = (void __user *)iter->ubuf;
+			iovec.iov_len = iov_iter_count(&iter->it);
+		}
+
 		if (rw == READ) {
 			nr = file->f_op->read(file, iovec.iov_base,
 					      iovec.iov_len, &kiocb->ki_pos);
@@ -1630,9 +1646,11 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 			break;
 		}
 		ret += nr;
+		iter->ubuf += nr;
+
 		if (nr != iovec.iov_len)
 			break;
-		iov_iter_advance(iter, nr);
+		iov_iter_advance(&iter->it, nr);
 	}
 
 	return ret;
@@ -1643,7 +1661,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
-	struct iov_iter iter;
+	struct io_ubuf_iter iter;
 	struct file *file;
 	size_t iov_count;
 	ssize_t read_size, ret;
@@ -1664,13 +1682,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 	if (req->flags & REQ_F_LINK)
 		req->result = read_size;
 
-	iov_count = iov_iter_count(&iter);
+	iov_count = iov_iter_count(&iter.it);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
 
 		if (file->f_op->read_iter)
-			ret2 = call_read_iter(file, kiocb, &iter);
+			ret2 = call_read_iter(file, kiocb, &iter.it);
 		else
 			ret2 = loop_rw_iter(READ, file, kiocb, &iter);
 
@@ -1701,7 +1719,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
-	struct iov_iter iter;
+	struct io_ubuf_iter iter;
 	struct file *file;
 	size_t iov_count;
 	ssize_t ret;
@@ -1721,7 +1739,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 	if (req->flags & REQ_F_LINK)
 		req->result = ret;
 
-	iov_count = iov_iter_count(&iter);
+	iov_count = iov_iter_count(&iter.it);
 
 	ret = -EAGAIN;
 	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
@@ -1747,7 +1765,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 		kiocb->ki_flags |= IOCB_WRITE;
 
 		if (file->f_op->write_iter)
-			ret2 = call_write_iter(file, kiocb, &iter);
+			ret2 = call_write_iter(file, kiocb, &iter.it);
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN)
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/2] io_uring: fix linked fixed !iter rw
  2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
  2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
@ 2019-11-23 22:49 ` Pavel Begunkov
  2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
  2019-11-23 23:08 ` Jens Axboe
  3 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring

As loop_rw_iter() may need mm even for fixed requests, update
io_req_needs_user(), so the offloading thread and io-wq can handle it as
well.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 566e987c6dab..d84b69872967 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,12 +525,13 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+static inline bool io_req_needs_user(struct io_kiocb *req)
 {
-	u8 opcode = READ_ONCE(sqe->opcode);
+	struct file *f = req->file;
+	u8 opcode = READ_ONCE(req->submit.sqe->opcode);
 
-	return !(opcode == IORING_OP_READ_FIXED ||
-		 opcode == IORING_OP_WRITE_FIXED);
+	return !((opcode == IORING_OP_READ_FIXED && f->f_op->read_iter) ||
+		(opcode == IORING_OP_WRITE_FIXED && f->f_op->write_iter));
 }
 
 static inline bool io_prep_async_work(struct io_kiocb *req,
@@ -559,7 +560,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
 				req->work.flags |= IO_WQ_WORK_UNBOUND;
 			break;
 		}
-		if (io_sqe_needs_user(req->submit.sqe))
+		if (io_req_needs_user(req))
 			req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 	}
 
@@ -1625,11 +1626,11 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 		struct iovec iovec;
 		ssize_t nr;
 
-		if (iter_is_iovec(&iter->it)) {
-			iovec = iov_iter_iovec(&iter->it);
-		} else {
+		if (iov_iter_is_bvec(&iter->it)) {
 			iovec.iov_base = (void __user *)iter->ubuf;
 			iovec.iov_len = iov_iter_count(&iter->it);
+		} else {
+			iovec = iov_iter_iovec(&iter->it);
 		}
 
 		if (rw == READ) {
@@ -3041,14 +3042,6 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 		goto err_req;
 	}
 
-	ret = io_req_set_file(state, req);
-	if (unlikely(ret)) {
-err_req:
-		io_cqring_add_event(req, ret);
-		io_double_put_req(req);
-		return;
-	}
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but
@@ -3092,6 +3085,11 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 	} else {
 		io_queue_sqe(req);
 	}
+
+	return;
+err_req:
+	io_cqring_add_event(req, ret);
+	io_double_put_req(req);
 }
 
 /*
@@ -3197,6 +3195,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	for (i = 0; i < nr; i++) {
 		struct io_kiocb *req;
 		unsigned int sqe_flags;
+		int ret;
 
 		req = io_get_req(ctx, statep);
 		if (unlikely(!req)) {
@@ -3209,7 +3208,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+		ret = io_req_set_file(statep, req);
+		if (unlikely(ret)) {
+			io_cqring_add_event(req, ret);
+			__io_free_req(req);
+			break;
+		}
+
+		if (io_req_needs_user(req) && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
 			if (!mm_fault) {
 				use_mm(ctx->sqo_mm);
@@ -3217,6 +3223,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
+
 		sqe_flags = req->submit.sqe->flags;
 
 		req->submit.ring_file = ring_file;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC 0/2] fix in-kernel segfault
  2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
  2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
  2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
@ 2019-11-23 22:53 ` Jens Axboe
  2019-11-23 23:08 ` Jens Axboe
  3 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-11-23 22:53 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/23/19 3:49 PM, Pavel Begunkov wrote:
> There is a bug hunging my system when run fixed-link with /dev/urandom
> instead of /dev/zero (see patch 1/2).
> 
> As for me, the easiest way to fix is to grab mm and use userspace
> address for this specific case (as it's done in patches). The other
> way is to kmap/vmap, but the first should be short-lived and the
> second needs mm anyway.
> 
> Ideas how to do it better way? Suggestions and corrections are welcome.
> 
> P.S. It seems for some reason it fails a bunch of unrelated tests,
> that's POC and not meant to be applied yet.

I'll take a look at this, but probably not until tomorrow afternoon
or Monday.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC 0/2] fix in-kernel segfault
  2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
                   ` (2 preceding siblings ...)
  2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
@ 2019-11-23 23:08 ` Jens Axboe
  2019-11-24  8:57   ` Pavel Begunkov
  3 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2019-11-23 23:08 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/23/19 3:49 PM, Pavel Begunkov wrote:
> There is a bug hunging my system when run fixed-link with /dev/urandom
> instead of /dev/zero (see patch 1/2).
> 
> As for me, the easiest way to fix is to grab mm and use userspace
> address for this specific case (as it's done in patches). The other
> way is to kmap/vmap, but the first should be short-lived and the
> second needs mm anyway.
> 
> Ideas how to do it better way? Suggestions and corrections are welcome.

OK, took a quick look. kmap() etc doesn't need context, but the copy
does. How about just ensuring we grab the mm for cases that don't have
->read_iter() or ->write_iter() and then just map and copy in that
loop that handles that exact case? I think that's cleaner than what
you have.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC 0/2] fix in-kernel segfault
  2019-11-23 23:08 ` Jens Axboe
@ 2019-11-24  8:57   ` Pavel Begunkov
  2019-11-24 16:36     ` Jens Axboe
  0 siblings, 1 reply; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-24  8:57 UTC (permalink / raw)
  To: Jens Axboe, io-uring


[-- Attachment #1.1: Type: text/plain, Size: 1131 bytes --]

On 24/11/2019 02:08, Jens Axboe wrote:
> On 11/23/19 3:49 PM, Pavel Begunkov wrote:
>> There is a bug hunging my system when run fixed-link with /dev/urandom
>> instead of /dev/zero (see patch 1/2).
>>
>> As for me, the easiest way to fix is to grab mm and use userspace
>> address for this specific case (as it's done in patches). The other
>> way is to kmap/vmap, but the first should be short-lived and the
>> second needs mm anyway.
>>
>> Ideas how to do it better way? Suggestions and corrections are welcome.
> 
> OK, took a quick look. kmap() etc doesn't need context, but the copy

Thanks! What copy do you mean? The first and pretty short version was
with kmap.
e.g. while(count) { read(kmap()); ...; knumap(); }

I'll send this shortly. What I don't like here, is that it passes
kmapped virtual address as "void __user *". Is that ok?
	

> does. How about just ensuring we grab the mm for cases that don't have
> ->read_iter() or ->write_iter() and then just map and copy in that
> loop that handles that exact case? I think that's cleaner than what
> you have.
> 

-- 
Pavel Begunkov




[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC 0/2] fix in-kernel segfault
  2019-11-24  8:57   ` Pavel Begunkov
@ 2019-11-24 16:36     ` Jens Axboe
  0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-11-24 16:36 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/24/19 1:57 AM, Pavel Begunkov wrote:
> On 24/11/2019 02:08, Jens Axboe wrote:
>> On 11/23/19 3:49 PM, Pavel Begunkov wrote:
>>> There is a bug hunging my system when run fixed-link with /dev/urandom
>>> instead of /dev/zero (see patch 1/2).
>>>
>>> As for me, the easiest way to fix is to grab mm and use userspace
>>> address for this specific case (as it's done in patches). The other
>>> way is to kmap/vmap, but the first should be short-lived and the
>>> second needs mm anyway.
>>>
>>> Ideas how to do it better way? Suggestions and corrections are welcome.
>>
>> OK, took a quick look. kmap() etc doesn't need context, but the copy
> 
> Thanks! What copy do you mean? The first and pretty short version was
> with kmap.
> e.g. while(count) { read(kmap()); ...; knumap(); }
> 
> I'll send this shortly. What I don't like here, is that it passes
> kmapped virtual address as "void __user *". Is that ok?

I think that's OK, it is a user address, after all. Stripping the other
way is usually a bigger concern :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-11-24 16:36 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
2019-11-23 23:08 ` Jens Axboe
2019-11-24  8:57   ` Pavel Begunkov
2019-11-24 16:36     ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox