* [RFC 0/2] fix in-kernel segfault
@ 2019-11-23 22:49 Pavel Begunkov
2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
` (3 more replies)
0 siblings, 4 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
To: Jens Axboe, io-uring
There is a bug hunging my system when run fixed-link with /dev/urandom
instead of /dev/zero (see patch 1/2).
As for me, the easiest way to fix is to grab mm and use userspace
address for this specific case (as it's done in patches). The other
way is to kmap/vmap, but the first should be short-lived and the
second needs mm anyway.
Ideas how to do it better way? Suggestions and corrections are welcome.
P.S. It seems for some reason it fails a bunch of unrelated tests,
that's POC and not meant to be applied yet.
Pavel Begunkov (2):
io_uring: fix dead-hung for non-iter fixed rw
io_uring: fix linked fixed !iter rw
fs/io_uring.c | 95 ++++++++++++++++++++++++++++++++-------------------
1 file changed, 60 insertions(+), 35 deletions(-)
--
2.24.0
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
@ 2019-11-23 22:49 ` Pavel Begunkov
2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
To: Jens Axboe, io-uring
Read/write requests to devices without implemented read/write_iter
using fixed buffers causes general protection fault, which totally
hanged up a machine.
io_import_fixed() initialises iov_iter as a bvec, but loop_rw_iter()
tries to access its unitialised iov, so dereferencing random address.
Fix it by passing a userspace ptr instead in this case.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++------------------
1 file changed, 39 insertions(+), 21 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8119cbae4fb6..566e987c6dab 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -172,6 +172,11 @@ struct io_mapped_ubuf {
unsigned int nr_bvecs;
};
+struct io_ubuf_iter {
+ struct iov_iter it;
+ u64 ubuf;
+};
+
struct fixed_file_table {
struct file **files;
};
@@ -1486,7 +1491,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
const struct io_uring_sqe *sqe,
- struct iov_iter *iter)
+ struct io_ubuf_iter *iter)
{
size_t len = READ_ONCE(sqe->len);
struct io_mapped_ubuf *imu;
@@ -1513,12 +1518,14 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
return -EFAULT;
+ iter->ubuf = buf_addr;
+
/*
* May not be a start of buffer, set size appropriately
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+ iov_iter_bvec(&iter->it, rw, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
/*
@@ -1540,7 +1547,7 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
const struct bio_vec *bvec = imu->bvec;
if (offset <= bvec->bv_len) {
- iov_iter_advance(iter, offset);
+ iov_iter_advance(&iter->it, offset);
} else {
unsigned long seg_skip;
@@ -1548,25 +1555,27 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
offset -= bvec->bv_len;
seg_skip = 1 + (offset >> PAGE_SHIFT);
- iter->bvec = bvec + seg_skip;
- iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
+ iter->it.bvec = bvec + seg_skip;
+ iter->it.nr_segs -= seg_skip;
+ iter->it.count -= bvec->bv_len + offset;
+ iter->it.iov_offset = offset & ~PAGE_MASK;
}
}
- return iter->count;
+ return iter->it.count;
}
static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+ struct io_ubuf_iter *iter)
{
const struct io_uring_sqe *sqe = s->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
+ iter->ubuf = 0;
+
/*
* We're reading ->opcode for the second time, but the first read
* doesn't care whether it's _FIXED or not, so it doesn't matter
@@ -1587,10 +1596,10 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
#ifdef CONFIG_COMPAT
if (ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
- iovec, iter);
+ iovec, &iter->it);
#endif
- return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+ return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, &iter->it);
}
/*
@@ -1598,7 +1607,7 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
* by looping over ->read() or ->write() manually.
*/
static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
- struct iov_iter *iter)
+ struct io_ubuf_iter *iter)
{
ssize_t ret = 0;
@@ -1612,10 +1621,17 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
if (kiocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- while (iov_iter_count(iter)) {
- struct iovec iovec = iov_iter_iovec(iter);
+ while (iov_iter_count(&iter->it)) {
+ struct iovec iovec;
ssize_t nr;
+ if (iter_is_iovec(&iter->it)) {
+ iovec = iov_iter_iovec(&iter->it);
+ } else {
+ iovec.iov_base = (void __user *)iter->ubuf;
+ iovec.iov_len = iov_iter_count(&iter->it);
+ }
+
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, &kiocb->ki_pos);
@@ -1630,9 +1646,11 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
break;
}
ret += nr;
+ iter->ubuf += nr;
+
if (nr != iovec.iov_len)
break;
- iov_iter_advance(iter, nr);
+ iov_iter_advance(&iter->it, nr);
}
return ret;
@@ -1643,7 +1661,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
- struct iov_iter iter;
+ struct io_ubuf_iter iter;
struct file *file;
size_t iov_count;
ssize_t read_size, ret;
@@ -1664,13 +1682,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
if (req->flags & REQ_F_LINK)
req->result = read_size;
- iov_count = iov_iter_count(&iter);
+ iov_count = iov_iter_count(&iter.it);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
if (!ret) {
ssize_t ret2;
if (file->f_op->read_iter)
- ret2 = call_read_iter(file, kiocb, &iter);
+ ret2 = call_read_iter(file, kiocb, &iter.it);
else
ret2 = loop_rw_iter(READ, file, kiocb, &iter);
@@ -1701,7 +1719,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
- struct iov_iter iter;
+ struct io_ubuf_iter iter;
struct file *file;
size_t iov_count;
ssize_t ret;
@@ -1721,7 +1739,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
if (req->flags & REQ_F_LINK)
req->result = ret;
- iov_count = iov_iter_count(&iter);
+ iov_count = iov_iter_count(&iter.it);
ret = -EAGAIN;
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
@@ -1747,7 +1765,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
kiocb->ki_flags |= IOCB_WRITE;
if (file->f_op->write_iter)
- ret2 = call_write_iter(file, kiocb, &iter);
+ ret2 = call_write_iter(file, kiocb, &iter.it);
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
if (!force_nonblock || ret2 != -EAGAIN)
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/2] io_uring: fix linked fixed !iter rw
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
@ 2019-11-23 22:49 ` Pavel Begunkov
2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
2019-11-23 23:08 ` Jens Axboe
3 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-23 22:49 UTC (permalink / raw)
To: Jens Axboe, io-uring
As loop_rw_iter() may need mm even for fixed requests, update
io_req_needs_user(), so the offloading thread and io-wq can handle it as
well.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 41 ++++++++++++++++++++++++-----------------
1 file changed, 24 insertions(+), 17 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 566e987c6dab..d84b69872967 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,12 +525,13 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+static inline bool io_req_needs_user(struct io_kiocb *req)
{
- u8 opcode = READ_ONCE(sqe->opcode);
+ struct file *f = req->file;
+ u8 opcode = READ_ONCE(req->submit.sqe->opcode);
- return !(opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED);
+ return !((opcode == IORING_OP_READ_FIXED && f->f_op->read_iter) ||
+ (opcode == IORING_OP_WRITE_FIXED && f->f_op->write_iter));
}
static inline bool io_prep_async_work(struct io_kiocb *req,
@@ -559,7 +560,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
- if (io_sqe_needs_user(req->submit.sqe))
+ if (io_req_needs_user(req))
req->work.flags |= IO_WQ_WORK_NEEDS_USER;
}
@@ -1625,11 +1626,11 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
struct iovec iovec;
ssize_t nr;
- if (iter_is_iovec(&iter->it)) {
- iovec = iov_iter_iovec(&iter->it);
- } else {
+ if (iov_iter_is_bvec(&iter->it)) {
iovec.iov_base = (void __user *)iter->ubuf;
iovec.iov_len = iov_iter_count(&iter->it);
+ } else {
+ iovec = iov_iter_iovec(&iter->it);
}
if (rw == READ) {
@@ -3041,14 +3042,6 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
goto err_req;
}
- ret = io_req_set_file(state, req);
- if (unlikely(ret)) {
-err_req:
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- return;
- }
-
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
@@ -3092,6 +3085,11 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
} else {
io_queue_sqe(req);
}
+
+ return;
+err_req:
+ io_cqring_add_event(req, ret);
+ io_double_put_req(req);
}
/*
@@ -3197,6 +3195,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
for (i = 0; i < nr; i++) {
struct io_kiocb *req;
unsigned int sqe_flags;
+ int ret;
req = io_get_req(ctx, statep);
if (unlikely(!req)) {
@@ -3209,7 +3208,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
- if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+ ret = io_req_set_file(statep, req);
+ if (unlikely(ret)) {
+ io_cqring_add_event(req, ret);
+ __io_free_req(req);
+ break;
+ }
+
+ if (io_req_needs_user(req) && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -3217,6 +3223,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
}
+
sqe_flags = req->submit.sqe->flags;
req->submit.ring_file = ring_file;
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [RFC 0/2] fix in-kernel segfault
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
@ 2019-11-23 22:53 ` Jens Axboe
2019-11-23 23:08 ` Jens Axboe
3 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-11-23 22:53 UTC (permalink / raw)
To: Pavel Begunkov, io-uring
On 11/23/19 3:49 PM, Pavel Begunkov wrote:
> There is a bug hunging my system when run fixed-link with /dev/urandom
> instead of /dev/zero (see patch 1/2).
>
> As for me, the easiest way to fix is to grab mm and use userspace
> address for this specific case (as it's done in patches). The other
> way is to kmap/vmap, but the first should be short-lived and the
> second needs mm anyway.
>
> Ideas how to do it better way? Suggestions and corrections are welcome.
>
> P.S. It seems for some reason it fails a bunch of unrelated tests,
> that's POC and not meant to be applied yet.
I'll take a look at this, but probably not until tomorrow afternoon
or Monday.
--
Jens Axboe
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC 0/2] fix in-kernel segfault
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
` (2 preceding siblings ...)
2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
@ 2019-11-23 23:08 ` Jens Axboe
2019-11-24 8:57 ` Pavel Begunkov
3 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2019-11-23 23:08 UTC (permalink / raw)
To: Pavel Begunkov, io-uring
On 11/23/19 3:49 PM, Pavel Begunkov wrote:
> There is a bug hunging my system when run fixed-link with /dev/urandom
> instead of /dev/zero (see patch 1/2).
>
> As for me, the easiest way to fix is to grab mm and use userspace
> address for this specific case (as it's done in patches). The other
> way is to kmap/vmap, but the first should be short-lived and the
> second needs mm anyway.
>
> Ideas how to do it better way? Suggestions and corrections are welcome.
OK, took a quick look. kmap() etc doesn't need context, but the copy
does. How about just ensuring we grab the mm for cases that don't have
->read_iter() or ->write_iter() and then just map and copy in that
loop that handles that exact case? I think that's cleaner than what
you have.
--
Jens Axboe
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC 0/2] fix in-kernel segfault
2019-11-23 23:08 ` Jens Axboe
@ 2019-11-24 8:57 ` Pavel Begunkov
2019-11-24 16:36 ` Jens Axboe
0 siblings, 1 reply; 7+ messages in thread
From: Pavel Begunkov @ 2019-11-24 8:57 UTC (permalink / raw)
To: Jens Axboe, io-uring
[-- Attachment #1.1: Type: text/plain, Size: 1131 bytes --]
On 24/11/2019 02:08, Jens Axboe wrote:
> On 11/23/19 3:49 PM, Pavel Begunkov wrote:
>> There is a bug hunging my system when run fixed-link with /dev/urandom
>> instead of /dev/zero (see patch 1/2).
>>
>> As for me, the easiest way to fix is to grab mm and use userspace
>> address for this specific case (as it's done in patches). The other
>> way is to kmap/vmap, but the first should be short-lived and the
>> second needs mm anyway.
>>
>> Ideas how to do it better way? Suggestions and corrections are welcome.
>
> OK, took a quick look. kmap() etc doesn't need context, but the copy
Thanks! What copy do you mean? The first and pretty short version was
with kmap.
e.g. while(count) { read(kmap()); ...; knumap(); }
I'll send this shortly. What I don't like here, is that it passes
kmapped virtual address as "void __user *". Is that ok?
> does. How about just ensuring we grab the mm for cases that don't have
> ->read_iter() or ->write_iter() and then just map and copy in that
> loop that handles that exact case? I think that's cleaner than what
> you have.
>
--
Pavel Begunkov
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC 0/2] fix in-kernel segfault
2019-11-24 8:57 ` Pavel Begunkov
@ 2019-11-24 16:36 ` Jens Axboe
0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-11-24 16:36 UTC (permalink / raw)
To: Pavel Begunkov, io-uring
On 11/24/19 1:57 AM, Pavel Begunkov wrote:
> On 24/11/2019 02:08, Jens Axboe wrote:
>> On 11/23/19 3:49 PM, Pavel Begunkov wrote:
>>> There is a bug hunging my system when run fixed-link with /dev/urandom
>>> instead of /dev/zero (see patch 1/2).
>>>
>>> As for me, the easiest way to fix is to grab mm and use userspace
>>> address for this specific case (as it's done in patches). The other
>>> way is to kmap/vmap, but the first should be short-lived and the
>>> second needs mm anyway.
>>>
>>> Ideas how to do it better way? Suggestions and corrections are welcome.
>>
>> OK, took a quick look. kmap() etc doesn't need context, but the copy
>
> Thanks! What copy do you mean? The first and pretty short version was
> with kmap.
> e.g. while(count) { read(kmap()); ...; knumap(); }
>
> I'll send this shortly. What I don't like here, is that it passes
> kmapped virtual address as "void __user *". Is that ok?
I think that's OK, it is a user address, after all. Stripping the other
way is usually a bigger concern :-)
--
Jens Axboe
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2019-11-24 16:36 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-11-23 22:49 [RFC 0/2] fix in-kernel segfault Pavel Begunkov
2019-11-23 22:49 ` [PATCH 1/2] io_uring: fix dead-hung for non-iter fixed rw Pavel Begunkov
2019-11-23 22:49 ` [PATCH 2/2] io_uring: fix linked fixed !iter rw Pavel Begunkov
2019-11-23 22:53 ` [RFC 0/2] fix in-kernel segfault Jens Axboe
2019-11-23 23:08 ` Jens Axboe
2019-11-24 8:57 ` Pavel Begunkov
2019-11-24 16:36 ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox