public inbox for [email protected]
 help / color / mirror / Atom feed
* [io-uring] WARNING in io_fill_cqe_req_aux
@ 2024-06-07 17:07 chase xd
  2024-06-12  1:11 ` Pavel Begunkov
  0 siblings, 1 reply; 5+ messages in thread
From: chase xd @ 2024-06-07 17:07 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3799 bytes --]

Dear Linux kernel maintainers,

Syzkaller reports this previously unknown bug on Linux
6.8.0-rc3-00043-ga69d20885494-dirty #4. It seems the bug was
silently or unintentionally fixed in the latest version.

```
Syzkaller hit 'WARNING in io_fill_cqe_req_aux' bug.

------------[ cut here ]------------
WARNING: CPU: 7 PID: 8369 at io_uring/io_uring.h:132
io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
Modules linked in:
CPU: 7 PID: 8369 Comm: syz-executor263 Not tainted
6.8.0-rc3-00043-ga69d20885494-dirty #4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
Code: 48 8d bb 98 03 00 00 be ff ff ff ff e8 52 45 4b 06 31 ff 89 c3
89 c6 e8 b7 e2 2d fd 85 db 0f 85 d5 fe ff ff e8 0a e7 2d fd 90 <0f> 0b
90 e9 c7 fe ff ff e8 fc e6 2d fd e8 c7 38 fa fc 48 85 c0 0f
RSP: 0018:ffffc90012af79a8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff845cf059
RDX: ffff8880252ea440 RSI: ffffffff845cf066 RDI: 0000000000000005
RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
FS:  00005555570e13c0(0000) GS:ffff88823bd80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1bdbcae020 CR3: 0000000022624000 CR4: 0000000000750ef0
PKRU: 55555554
Call Trace:
 <TASK>
 io_fill_cqe_req_aux+0xd6/0x1f0 io_uring/io_uring.c:925
 io_poll_check_events io_uring/poll.c:325 [inline]
 io_poll_task_func+0x16f/0x1000 io_uring/poll.c:357
 io_handle_tw_list+0x172/0x560 io_uring/io_uring.c:1154
 tctx_task_work_run+0xaa/0x330 io_uring/io_uring.c:1226
 tctx_task_work+0x7b/0xd0 io_uring/io_uring.c:1244
 task_work_run+0x16d/0x260 kernel/task_work.c:180
 get_signal+0x1cb/0x25a0 kernel/signal.c:2669
 arch_do_signal_or_restart+0x81/0x7e0 arch/x86/kernel/signal.c:310
 exit_to_user_mode_loop kernel/entry/common.c:105 [inline]
 exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
 __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline]
 syscall_exit_to_user_mode+0x156/0x2b0 kernel/entry/common.c:212
 do_syscall_64+0xe5/0x270 arch/x86/entry/common.c:89
 entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7f1bdbc2d88d
Code: c3 e8 a7 1f 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48
89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d
01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffd12f6fa18 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
RAX: 0000000000000001 RBX: 000000000000220b RCX: 00007f1bdbc2d88d
RDX: 0000000000000000 RSI: 0000000000005012 RDI: 0000000000000003
RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
R13: 431bde82d7b634db R14: 00007f1bdbcaa4f0 R15: 0000000000000001
 </TASK>


Syzkaller reproducer:
# {Threaded:false Repeat:true RepeatTimes:0 Procs:1 Slowdown:1
Sandbox: SandboxArg:0 Leak:false NetInjection:false NetDevices:false
NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:false
KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false
Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false
HandleSegv:false Repro:false Trace:false LegacyOptions:{Collide:false
Fault:false FaultCall:0 FaultNth:0}}
r0 = syz_io_uring_setup(0x220b, &(0x7f0000000000)={0x0, 0x63db,
0x10000, 0x800}, &(0x7f0000000080)=<r1=>0x0,
&(0x7f0000000200)=<r2=>0x0)
r3 = socket$inet(0x2, 0x1, 0x0)
syz_io_uring_submit(r1, r2,
&(0x7f0000000a80)=@IORING_OP_POLL_ADD={0x6, 0x0, 0x0, @fd=r3, 0x0,
0x0, 0x1})
io_uring_enter(r0, 0x5012, 0x0, 0x0, 0x0, 0x0)
```

The C reproducer (crepro) is in the attachment.

Best Regards
Xdchase

[-- Attachment #2: repro.c --]
[-- Type: application/octet-stream, Size: 15624 bytes --]

#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_io_uring_enter
#define __NR_io_uring_enter 426
#endif
#ifndef __NR_io_uring_register
#define __NR_io_uring_register 427
#endif
#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup 425
#endif

// Per-thread state shared by segv_handler() and the NONFAILING() macro.
// clone_ongoing: when non-zero, segv_handler() exits instead of recovering.
// It is never written by the code visible in this file.
static __thread int clone_ongoing;
// skip_segv: incremented on entry to and decremented on exit from each
// NONFAILING() region; non-zero means a fault may be recovered from.
static __thread int skip_segv;
// segv_env: saved by _setjmp() in NONFAILING(), jumped to by segv_handler().
static __thread jmp_buf segv_env;

// SIGSEGV/SIGBUS handler installed by install_segv_handler().
//
// If the fault happened inside a NONFAILING() region (skip_segv != 0) and the
// faulting address lies outside the 1 MiB..100 MiB window (presumably where
// the program's own image lives -- confirm), control returns to the region's
// _setjmp() point. In every other case the process exits with the signal
// number as its exit status.
static void segv_handler(int sig, siginfo_t* info, void* ctx)
{
  (void)ctx;
  const uintptr_t window_lo = (uintptr_t)1 << 20;
  const uintptr_t window_hi = (uintptr_t)100 << 20;

  const bool cloning = __atomic_load_n(&clone_ongoing, __ATOMIC_RELAXED) != 0;
  if (!cloning) {
    const uintptr_t fault_addr = (uintptr_t)info->si_addr;
    const bool recoverable = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;
    const bool outside_window = fault_addr < window_lo || fault_addr > window_hi;
    if (recoverable && outside_window)
      _longjmp(segv_env, 1);
  }
  exit(sig);
}

// Sets up signal handling for the reproducer:
//  - signals 0x20 and 0x21 are set to SIG_IGN through the raw rt_sigaction
//    syscall (presumably because the libc wrapper rejects these numbers --
//    confirm);
//  - SIGSEGV and SIGBUS are routed to segv_handler() with SA_SIGINFO, and
//    SA_NODEFER so the handler can run again after it leaves via _longjmp().
static void install_segv_handler(void)
{
  static const int ignored_signals[] = {0x20, 0x21};
  struct sigaction act;

  memset(&act, 0, sizeof(act));
  act.sa_handler = SIG_IGN;
  for (size_t i = 0; i < sizeof(ignored_signals) / sizeof(ignored_signals[0]);
       i++)
    syscall(SYS_rt_sigaction, ignored_signals[i], &act, NULL, 8);

  memset(&act, 0, sizeof(act));
  act.sa_flags = SA_NODEFER | SA_SIGINFO;
  act.sa_sigaction = segv_handler;
  sigaction(SIGSEGV, &act, NULL);
  sigaction(SIGBUS, &act, NULL);
}

// NONFAILING(stmt): run `stmt` so that a SIGSEGV/SIGBUS raised while it runs
// does not terminate the process. The macro bumps skip_segv, saves a jump
// point with _setjmp(), executes the statement, and drops skip_segv again.
// If segv_handler() jumps back, the statement is abandoned. The statement
// expression evaluates to 1 when the statement completed and 0 when it was
// abandoned. Uses the GNU statement-expression extension.
#define NONFAILING(...)                                                        \
  ({                                                                           \
    int ok = 1;                                                                \
    __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST);                       \
    if (_setjmp(segv_env) == 0) {                                              \
      __VA_ARGS__;                                                             \
    } else                                                                     \
      ok = 0;                                                                  \
    __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST);                       \
    ok;                                                                        \
  })

// Suspends the caller for `ms` milliseconds using usleep().
static void sleep_ms(uint64_t ms)
{
  const uint64_t micros = ms * 1000;
  usleep(micros);
}

// Returns the CLOCK_MONOTONIC time in milliseconds.
// Exits the process with status 1 if the clock cannot be read.
static uint64_t current_time_ms(void)
{
  struct timespec now;
  const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
  if (rc != 0)
    exit(1);
  const uint64_t from_seconds = (uint64_t)now.tv_sec * 1000;
  const uint64_t from_nanos = (uint64_t)now.tv_nsec / 1000000;
  return from_seconds + from_nanos;
}

// Formats `what` (printf-style, at most 1023 characters are kept) and writes
// the result to the existing file `file` with a single write() call.
//
// Returns true only if the file could be opened for writing and the whole
// text was written. The file is not created if it is missing. When the write
// fails or is short, errno from write() is preserved across close().
static bool write_file(const char* file, const char* what, ...)
{
  char text[1024];
  va_list ap;
  va_start(ap, what);
  vsnprintf(text, sizeof(text), what, ap);
  va_end(ap);
  text[sizeof(text) - 1] = 0;
  const int text_len = strlen(text);

  const int fd = open(file, O_WRONLY | O_CLOEXEC);
  if (fd == -1)
    return false;

  const bool wrote_all = write(fd, text, text_len) == text_len;
  const int write_errno = errno;
  close(fd);
  if (!wrote_all)
    errno = write_errno;
  return wrote_all;
}

// Hard-coded entry sizes and ring field offsets (in bytes) for io_uring.
// NOTE(review): in the code visible in this file only SIZEOF_IO_URING_SQE and
// SIZEOF_IO_URING_CQE are referenced; the helpers below take ring field
// offsets from io_uring_params (sq_off / cq_off) rather than from the
// *_OFFSET macros.
#define SIZEOF_IO_URING_SQE 64
#define SIZEOF_IO_URING_CQE 16
#define SQ_HEAD_OFFSET 0
#define SQ_TAIL_OFFSET 64
#define SQ_RING_MASK_OFFSET 256
#define SQ_RING_ENTRIES_OFFSET 264
#define SQ_FLAGS_OFFSET 276
#define SQ_DROPPED_OFFSET 272
#define CQ_HEAD_OFFSET 128
#define CQ_TAIL_OFFSET 192
#define CQ_RING_MASK_OFFSET 260
#define CQ_RING_ENTRIES_OFFSET 268
#define CQ_RING_OVERFLOW_OFFSET 284
#define CQ_FLAGS_OFFSET 280
#define CQ_CQES_OFFSET 320

// From linux/io_uring.h
// Completion queue entry as consumed by syz_io_uring_complete().
struct io_uring_cqe {
  uint64_t user_data; // compared against the 0x12345 / 0x23456 markers
  uint32_t res;       // stored into result_fd[] for marker completions
  uint32_t flags;     // not read by the code in this file
};

/* This is x86 specific */
// Both macros expand to an empty asm statement with a "memory" clobber: they
// stop the compiler from reordering memory accesses across them but emit no
// CPU fence instruction.
#define read_barrier() __asm__ __volatile__("" ::: "memory")
#define write_barrier() __asm__ __volatile__("" ::: "memory")

// Submission-ring field offsets, embedded in io_uring_params as sq_off.
// The helpers in this file add tail, ring_mask and array to the base address
// of the ring mapping to locate those fields; the remaining members are not
// read here. Presumably mirrors the struct of the same name in
// linux/io_uring.h and is filled in by io_uring_setup(2) -- confirm.
struct io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;
  uint32_t resv1;
  uint64_t resv2;
};

// Completion-ring field offsets, embedded in io_uring_params as cq_off.
// syz_io_uring_complete() adds head, tail, ring_mask and cqes to the base
// address of the ring mapping; syz_io_uring_setup() uses cqes to size the
// mapping. Presumably mirrors the struct of the same name in
// linux/io_uring.h -- confirm.
struct io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t resv[2];
};

// Parameter block passed to the io_uring_setup syscall by
// syz_io_uring_setup(). After the call, sq_entries, cq_entries, sq_off and
// cq_off are read back to size and navigate the ring mappings. `flags` is
// edited by the caller before the syscall and consulted later by the
// submit/complete helpers.
struct io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t resv[4];
  struct io_sqring_offsets sq_off;
  struct io_cqring_offsets cq_off;
};

// mmap() file offsets used by syz_io_uring_setup() to map the ring and the
// SQE array from the io_uring fd.
#define IORING_OFF_SQ_RING 0
#define IORING_OFF_SQES 0x10000000ULL
// Setup flags. CQE32 and SQE128 are cleared by syz_io_uring_setup() before the
// syscall; NO_SQARRAY is tested by syz_io_uring_submit().
#define IORING_SETUP_SQE128 (1U << 10)
#define IORING_SETUP_CQE32 (1U << 11)
#define IORING_SETUP_NO_SQARRAY (1U << 16)

// Parameters of the most recent syz_io_uring_setup() call; read by
// syz_io_uring_submit() and syz_io_uring_complete(). NULL until setup runs.
static struct io_uring_params* io_uring_p;

// Drains the completion queue of the ring mapped at `a0` and collects the
// results of fd-producing operations.
//
//   a0: address of the ring mapping (ring_ptr).
//   a1: int array receiving cqe.res of each marker completion (result_fd).
//       May be NULL, in which case matches are counted but not stored.
//       Its capacity is not checked.
//
// Returns 0 if at least one marker completion was seen, -1 otherwise.
// Requires a prior syz_io_uring_setup() call (reads io_uring_p).
static long syz_io_uring_complete(volatile long a0, volatile long a1)
{
  // syzlang: syz_io_uring_complete(ring_ptr ring_ptr)
  // C:       syz_io_uring_complete(char* ring_ptr)
  // It is not checked if the ring is empty
  // Cast to original
  long ring_ptr = a0;
  int* result_fd = (int*)a1;
  unsigned* cq_head_ptr = (unsigned*)(io_uring_p->cq_off.head + ring_ptr);
  unsigned* cq_tail_ptr = (unsigned*)(io_uring_p->cq_off.tail + ring_ptr);
  int cnt = 0;
  // head read once: only this side advances it
  unsigned cq_head_raw = *cq_head_ptr;
  unsigned cq_ring_mask = *(unsigned*)(io_uring_p->cq_off.ring_mask + ring_ptr);
  // CQE stride doubles when the ring was created with IORING_SETUP_CQE32.
  const unsigned cqe_stride = (io_uring_p->flags & IORING_SETUP_CQE32) == 0
                                  ? SIZEOF_IO_URING_CQE
                                  : SIZEOF_IO_URING_CQE * 2;
  for (;;) {
    // Acquire-load the tail so the CQE contents written before the tail
    // update are visible to the reads below (replaces a compiler-only
    // barrier followed by a plain load).
    unsigned cq_tail_raw = __atomic_load_n(cq_tail_ptr, __ATOMIC_ACQUIRE);
    if (cq_head_raw == cq_tail_raw)
      break;
    // head != tail, retrieve cq from head
    unsigned cq_head = cq_head_raw & cq_ring_mask;
    struct io_uring_cqe* cqe =
        (struct io_uring_cqe*)(io_uring_p->cq_off.cqes + ring_ptr +
                               cqe_stride * cq_head);
    // In the descriptions (sys/linux/io_uring.txt), openat and openat2 are
    // passed with a unique range of sqe.user_data (0x12345 and 0x23456) to
    // identify the operations which produces an fd instance. Check
    // cqe.user_data, which should be the same as sqe.user_data for that
    // operation. If it falls in that unique range, return cqe.res as fd.
    if (cqe->user_data == 0x12345 || cqe->user_data == 0x23456) {
      // Callers in this file pass result_fd == NULL; do not fault on it,
      // otherwise the head update below would never be published.
      if (result_fd != NULL)
        result_fd[cnt] = cqe->res;
      cnt++;
    }
    cq_head_raw += 1;
  }
  // Publish the new head with release semantics so all CQE reads above are
  // complete before the slots can be reused. This matches the
  // __atomic_store_n() style already used by syz_io_uring_submit().
  __atomic_store_n(cq_head_ptr, cq_head_raw, __ATOMIC_RELEASE);
  return cnt == 0 ? -1 : 0;
}

// Wrapper for io_uring_setup and the subsequent mmap calls that map the ring
// and the sqes.
//
//   a0: number of entries requested (truncated to uint32_t).
//   a1: struct io_uring_params*, in/out. Remembered in io_uring_p for the
//       submit/complete helpers, even when the syscall fails.
//   a2: void** receiving the address of the ring mapping.
//   a3: void** receiving the address of the SQE array mapping.
//
// Returns the io_uring fd, or -1 if io_uring_setup failed (errno is left as
// set by the syscall; the two output pointers are then not written).
// mmap() failures are not detected: MAP_FAILED is stored in the output.
static long syz_io_uring_setup(volatile long a0, volatile long a1,
                               volatile long a2, volatile long a3)
{
  // syzlang: syz_io_uring_setup(entries int32[1:IORING_MAX_ENTRIES], params
  // ptr[inout, io_uring_params], ring_ptr ptr[out, ring_ptr], sqes_ptr ptr[out,
  // sqes_ptr]) fd_io_uring C:       syz_io_uring_setup(uint32_t entries, struct
  // io_uring_params* params, void** ring_ptr_out, void** sqes_ptr_out) //
  // returns uint32_t fd_io_uring Cast to original
  uint32_t entries = (uint32_t)a0;
  struct io_uring_params* setup_params = (struct io_uring_params*)a1;
  void** ring_ptr_out = (void**)a2;
  void** sqes_ptr_out = (void**)a3;
  // Temporarily disable IORING_SETUP_CQE32 and IORING_SETUP_SQE128 that may
  // change SIZEOF_IO_URING_CQE and SIZEOF_IO_URING_SQE. Tracking bug:
  // https://github.com/google/syzkaller/issues/4531.
  setup_params->flags &= ~(IORING_SETUP_CQE32 | IORING_SETUP_SQE128);
  // Keep the full-width result: storing it in a uint32_t turned the -1
  // failure value into 4294967295, which callers comparing against -1
  // mistook for a valid descriptor.
  long fd_io_uring = syscall(__NR_io_uring_setup, entries, setup_params);
  io_uring_p = setup_params;
  if (fd_io_uring == -1)
    return -1;
  // Compute the ring sizes
  uint32_t sq_ring_sz =
      setup_params->sq_off.array + setup_params->sq_entries * sizeof(uint32_t);
  uint32_t cq_ring_sz = setup_params->cq_off.cqes +
                        setup_params->cq_entries * SIZEOF_IO_URING_CQE;
  // Assumes IORING_FEAT_SINGLE_MMAP, which is always the case with the current
  // implementation. The implication is that the sq_ring_ptr and the
  // cq_ring_ptr are the same but the difference is in the offsets to access
  // the fields of these rings.
  uint32_t ring_sz = sq_ring_sz > cq_ring_sz ? sq_ring_sz : cq_ring_sz;
  *ring_ptr_out =
      mmap(0, ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
           (int)fd_io_uring, IORING_OFF_SQ_RING);
  uint32_t sqes_sz = setup_params->sq_entries * SIZEOF_IO_URING_SQE;
  *sqes_ptr_out =
      mmap(0, sqes_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
           (int)fd_io_uring, IORING_OFF_SQES);
  return fd_io_uring;
}

// Queues one SQE on the ring set up by syz_io_uring_setup().
//
//   a0: address of the ring mapping (ring_ptr).
//   a1: address of the SQE array mapping (sqes_ptr).
//   a2: address of the 64-byte SQE to copy in (sqe).
//
// Always returns 0. It is not checked whether the ring is full. The caller
// still has to call io_uring_enter() to have the entry consumed.
static long syz_io_uring_submit(volatile long a0, volatile long a1,
                                volatile long a2)
{
  // syzlang: syz_io_uring_submit(ring_ptr ring_ptr, sqes_ptr sqes_ptr,
  // sqe ptr[in, io_uring_sqe]) C:       syz_io_uring_submit(char* ring_ptr,
  // io_uring_sqe* sqes_ptr,    io_uring_sqe* sqe) It is not checked if the ring
  // is full Cast to original
  long ring_ptr = a0; // This will be exposed to offsets in bytes
  char* sqes_ptr = (char*)a1;
  char* sqe = (char*)a2;
  unsigned* sq_tail_ptr = (unsigned*)(io_uring_p->sq_off.tail + ring_ptr);
  unsigned sq_tail_raw = *sq_tail_ptr;
  unsigned sq_ring_mask = *(unsigned*)(io_uring_p->sq_off.ring_mask + ring_ptr);
  unsigned sq_tail = sq_tail_raw & sq_ring_mask;
  // write to current sq tail sqe; slots are twice as large with SQE128
  void* sqe_dest;
  if ((io_uring_p->flags & IORING_SETUP_SQE128) == 0)
    sqe_dest = (void*)(sqes_ptr + SIZEOF_IO_URING_SQE * sq_tail);
  else
    sqe_dest = (void*)(sqes_ptr + SIZEOF_IO_URING_SQE * sq_tail * 2);

  // Write the sqe entry to its destination in sqes
  memcpy(sqe_dest, sqe, SIZEOF_IO_URING_SQE);
  // update sq array. This must happen before the tail is published:
  // previously the array slot was written after the tail store, so a
  // concurrent consumer could observe the new tail with a stale index.
  if ((io_uring_p->flags & IORING_SETUP_NO_SQARRAY) == 0)
    __atomic_store_n((unsigned*)(io_uring_p->sq_off.array + ring_ptr) + sq_tail,
                     sq_tail, __ATOMIC_RELEASE);
  // Advance the tail. Tail is a free-flowing integer and relies on natural
  // wrapping. The release store ensures that the kernel will never see a tail
  // update without the preceding SQE and array stores being done.
  __atomic_store_n(sq_tail_ptr, sq_tail_raw + 1, __ATOMIC_RELEASE);
  // Now the application is free to call io_uring_enter() to submit the sqe
  return 0;
}

// Kills the child `pid` (and its process group) with SIGKILL and blocks until
// it has been reaped, storing the wait status in *status.
//
// The child is first polled for up to 100 rounds of 1 ms. If it is still not
// reaped, one byte is written to every /sys/fs/fuse/connections/*/abort file
// (presumably to release a child stuck on a FUSE request -- confirm) and the
// function then waits without a timeout.
static void kill_and_wait(int pid, int* status)
{
  kill(-pid, SIGKILL);
  kill(pid, SIGKILL);

  int polls_left = 100;
  while (polls_left-- > 0) {
    if (waitpid(-1, status, WNOHANG | __WALL) == pid)
      return;
    usleep(1000);
  }

  DIR* connections = opendir("/sys/fs/fuse/connections");
  if (connections != NULL) {
    struct dirent* entry;
    while ((entry = readdir(connections)) != NULL) {
      const char* name = entry->d_name;
      if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
        continue;
      char abort_path[300];
      snprintf(abort_path, sizeof(abort_path),
               "/sys/fs/fuse/connections/%s/abort", name);
      const int fd = open(abort_path, O_WRONLY);
      if (fd == -1)
        continue;
      // Best effort: the result of the write is deliberately ignored.
      const ssize_t ignored = write(fd, abort_path, 1);
      (void)ignored;
      close(fd);
    }
    closedir(connections);
  }

  for (;;) {
    if (waitpid(-1, status, __WALL) == pid)
      break;
  }
}

// Per-child preparation, run in the forked process before execute_one():
// request SIGKILL when the parent dies, move into a new process group, and
// write "1000" to this process's oom_score_adj file.
// All three steps are best effort; their results are ignored.
static void setup_test()
{
  static const char oom_adj_path[] = "/proc/self/oom_score_adj";

  (void)prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  (void)setpgrp();
  (void)write_file(oom_adj_path, "1000");
}

// Forward declaration: loop() calls execute_one(), which is defined after it.
static void execute_one(void);

// Extra waitpid() option used by loop() when polling for the child.
#define WAIT_FLAGS __WALL

// Runs the reproducer forever, one forked child per iteration.
//
// The child calls setup_test() and execute_one() and exits with status 0.
// The parent polls for it every millisecond; a child that has not been
// reaped within 5000 ms is handed to kill_and_wait(). Exits with status 1 if
// fork() fails. Never returns.
static void loop(void)
{
  int iter = 0;
  while (true) {
    const int child = fork();
    if (child < 0)
      exit(1);
    if (child == 0) {
      setup_test();
      execute_one();
      exit(0);
    }

    int status = 0;
    const uint64_t started_at = current_time_ms();
    bool reaped = false;
    while (!reaped) {
      if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == child) {
        reaped = true;
      } else {
        sleep_ms(1);
        if (current_time_ms() - started_at >= 5000) {
          kill_and_wait(child, &status);
          reaped = true;
        }
      }
    }
    iter++;
  }
}

// Resource slots of the generated program, as assigned in execute_one():
// r[0] = io_uring fd, r[1] = ring mapping address, r[2] = SQE array mapping
// address, r[3] = socket fd.
uint64_t r[4] = {0x0, 0x0, 0x0, 0x0};

// One run of the reproducer; called in the forked child by loop().
// All program data lives in the fixed mapping at 0x20000000 created by
// main(). Accesses to it, and the syz_* helpers, are wrapped in NONFAILING so
// that a fault abandons only that statement. The return values of the plain
// syscalls are ignored unless they are assigned to `res`.
void execute_one(void)
{
  intptr_t res = 0;
  // Fill struct io_uring_params at 0x20000000 (offsets per the struct declared
  // in this file): cq_entries, flags, sq_thread_cpu, sq_thread_idle, resv[0],
  // then resv[1..3]. flags 0x800 equals IORING_SETUP_CQE32, which
  // syz_io_uring_setup() clears before calling the kernel.
  NONFAILING(*(uint32_t*)0x20000004 = 0x2e26);
  NONFAILING(*(uint32_t*)0x20000008 = 0x800);
  NONFAILING(*(uint32_t*)0x2000000c = 3);
  NONFAILING(*(uint32_t*)0x20000010 = 0x2cf);
  NONFAILING(*(uint32_t*)0x20000018 = 0);
  NONFAILING(memset((void*)0x2000001c, 0, 12));
  // res stays -1 if the helper faults before returning.
  res = -1;
  NONFAILING(res = syz_io_uring_setup(/*entries=*/0x2299, /*params=*/0x20000000,
                                      /*ring_ptr=*/0x20000080,
                                      /*sqes_ptr=*/0x200000c0));
  if (res != -1) {
    r[0] = res;
    // Pick up the two mapping addresses the helper stored.
    NONFAILING(r[1] = *(uint64_t*)0x20000080);
    NONFAILING(r[2] = *(uint64_t*)0x200000c0);
  }
  // Two open() calls with a NULL path; results are discarded.
  syscall(__NR_open, /*file=*/0ul,
          /*flags=__O_TMPFILE|O_SYNC|O_NONBLOCK|O_EXCL|O_DIRECTORY*/ 0x511880ul,
          /*mode=S_IWOTH|S_IXGRP*/ 0xaul);
  syscall(__NR_open, /*file=*/0ul,
          /*flags=O_DIRECTORY|O_DIRECT|O_CLOEXEC|O_APPEND*/ 0x94400ul,
          /*mode=S_IXGRP|S_IXUSR|S_IWUSR|S_IRUSR*/ 0x1c8ul);
  // Socket whose fd is later placed in the SQE built below.
  res = syscall(__NR_socket, /*domain=AF_UNIX*/ 1ul, /*type=SOCK_STREAM*/ 1ul,
                /*proto=*/0);
  if (res != -1)
    r[3] = res;
  syscall(__NR_epoll_create1, /*flags=*/0ul);
  syscall(__NR_eventfd2, /*initval=*/0x200, /*flags=*/0ul);
  // io_uring_register with opcode 0xf and no argument (opcode not decoded in
  // the generated source).
  syscall(__NR_io_uring_register, /*fd=*/r[0], /*opcode=*/0xful, /*arg=*/0ul,
          /*size=*/0ul);
  // sqe=0: the source pointer is NULL, so the memcpy in syz_io_uring_submit()
  // is expected to fault and be abandoned by NONFAILING before the tail moves.
  NONFAILING(
      syz_io_uring_submit(/*ring_ptr=*/r[1], /*sqes_ptr=*/r[2], /*sqe=*/0));
  syscall(__NR_io_uring_enter, /*fd=*/r[0], /*to_submit=*/1, /*min_complete=*/1,
          /*flags=IORING_ENTER_SQ_WAIT|IORING_ENTER_GETEVENTS*/ 5ul,
          /*sigmask=*/0ul, /*size=*/0ul);
  // result_fd=0: no array is supplied for fd results.
  NONFAILING(syz_io_uring_complete(/*ring_ptr=*/r[1], /*result_fd=*/0));
  // io_uring_register with opcode 0x13 (not decoded in the generated source).
  syscall(__NR_io_uring_register, /*fd=*/r[0], /*opcode=*/0x13ul, /*arg=*/0ul,
          /*nr_args=*/2ul);
  NONFAILING(
      syz_io_uring_submit(/*ring_ptr=*/r[1], /*sqes_ptr=*/r[2], /*sqe=*/0));
  // Build a 64-byte SQE at 0x20000540..0x2000057f. The first byte is 6 and the
  // 32-bit field at offset 4 holds the socket fd from r[3].
  // NOTE(review): field meanings follow struct io_uring_sqe in
  // linux/io_uring.h, which is not declared in this file; the syz program in
  // the report labels opcode 0x6 as IORING_OP_POLL_ADD -- confirm.
  NONFAILING(*(uint8_t*)0x20000540 = 6);
  NONFAILING(*(uint8_t*)0x20000541 = 0xc);
  NONFAILING(*(uint16_t*)0x20000542 = 0);
  NONFAILING(*(uint32_t*)0x20000544 = r[3]);
  NONFAILING(*(uint64_t*)0x20000548 = 0);
  NONFAILING(*(uint64_t*)0x20000550 = 0);
  NONFAILING(*(uint16_t*)0x20000558 = 1);
  NONFAILING(*(uint16_t*)0x2000055a = 0);
  NONFAILING(*(uint16_t*)0x2000055c = 0);
  NONFAILING(*(uint64_t*)0x20000560 = 1);
  NONFAILING(*(uint16_t*)0x20000568 = 0);
  NONFAILING(*(uint16_t*)0x2000056a = 0);
  NONFAILING(memset((void*)0x2000056c, 0, 4));
  NONFAILING(memset((void*)0x20000570, 0, 16));
  // Queue that SQE and ask the kernel to submit and wait for completions.
  NONFAILING(syz_io_uring_submit(/*ring_ptr=*/r[1], /*sqes_ptr=*/r[2],
                                 /*sqe=*/0x20000540));
  syscall(__NR_io_uring_enter, /*fd=*/r[0], /*to_submit=*/2, /*min_complete=*/2,
          /*flags=IORING_ENTER_SQ_WAKEUP|IORING_ENTER_GETEVENTS*/ 3ul,
          /*sigmask=*/0ul, /*size=*/0ul);
  NONFAILING(syz_io_uring_complete(/*ring_ptr=*/r[1], /*result_fd=*/0));
}
// Entry point: creates the fixed address-space layout used by execute_one(),
// installs the fault handlers and starts the fork loop.
//
// Three MAP_FIXED anonymous private mappings are created, in order:
//   0x1ffff000, one page, no access      -- page just below the data region
//   0x20000000, 16 MiB, read/write/exec  -- data region for program arguments
//   0x21000000, one page, no access      -- page just above the data region
// The mmap results are not checked. loop() never returns.
int main(void)
{
  static const struct {
    unsigned long addr;
    unsigned long len;
    unsigned long prot;
  } regions[] = {
      {0x1ffff000ul, 0x1000ul, 0ul},
      {0x20000000ul, 0x1000000ul, /*PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul},
      {0x21000000ul, 0x1000ul, 0ul},
  };

  for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++)
    syscall(__NR_mmap, regions[i].addr, regions[i].len, regions[i].prot,
            /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
            /*offset=*/0ul);

  install_segv_handler();
  loop();
  return 0;
}

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [io-uring] WARNING in io_fill_cqe_req_aux
  2024-06-07 17:07 [io-uring] WARNING in io_fill_cqe_req_aux chase xd
@ 2024-06-12  1:11 ` Pavel Begunkov
  2024-06-12  7:10   ` chase xd
  0 siblings, 1 reply; 5+ messages in thread
From: Pavel Begunkov @ 2024-06-12  1:11 UTC (permalink / raw)
  To: chase xd, axboe, io-uring, linux-kernel

On 6/7/24 18:07, chase xd wrote:
> Dear Linux kernel maintainers,
> 
> Syzkaller reports this previously unknown bug on Linux
> 6.8.0-rc3-00043-ga69d20885494-dirty #4. Seems like the bug was
> silently or unintendedly fixed in the latest version.

The branch you're using is confusing: apart from being
dirty and based on rc3, it apparently has never been merged.
The patch the test fails on looks different upstream:


commit 902ce82c2aa130bea5e3feca2d4ae62781865da7
Author: Pavel Begunkov <[email protected]>
Date:   Mon Mar 18 22:00:32 2024 +0000

     io_uring: get rid of intermediate aux cqe caches


It reproduces with your version, but not with anything
upstream.


> ```
> Syzkaller hit 'WARNING in io_fill_cqe_req_aux' bug.
> 
> ------------[ cut here ]------------
> WARNING: CPU: 7 PID: 8369 at io_uring/io_uring.h:132
> io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
> Modules linked in:
> CPU: 7 PID: 8369 Comm: syz-executor263 Not tainted
> 6.8.0-rc3-00043-ga69d20885494-dirty #4
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> RIP: 0010:io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
> Code: 48 8d bb 98 03 00 00 be ff ff ff ff e8 52 45 4b 06 31 ff 89 c3
> 89 c6 e8 b7 e2 2d fd 85 db 0f 85 d5 fe ff ff e8 0a e7 2d fd 90 <0f> 0b
> 90 e9 c7 fe ff ff e8 fc e6 2d fd e8 c7 38 fa fc 48 85 c0 0f
> RSP: 0018:ffffc90012af79a8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff845cf059
> RDX: ffff8880252ea440 RSI: ffffffff845cf066 RDI: 0000000000000005
> RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
> R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
> FS:  00005555570e13c0(0000) GS:ffff88823bd80000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f1bdbcae020 CR3: 0000000022624000 CR4: 0000000000750ef0
> PKRU: 55555554
> Call Trace:
>   <TASK>
>   io_fill_cqe_req_aux+0xd6/0x1f0 io_uring/io_uring.c:925
>   io_poll_check_events io_uring/poll.c:325 [inline]
>   io_poll_task_func+0x16f/0x1000 io_uring/poll.c:357
>   io_handle_tw_list+0x172/0x560 io_uring/io_uring.c:1154
>   tctx_task_work_run+0xaa/0x330 io_uring/io_uring.c:1226
>   tctx_task_work+0x7b/0xd0 io_uring/io_uring.c:1244
>   task_work_run+0x16d/0x260 kernel/task_work.c:180
>   get_signal+0x1cb/0x25a0 kernel/signal.c:2669
>   arch_do_signal_or_restart+0x81/0x7e0 arch/x86/kernel/signal.c:310
>   exit_to_user_mode_loop kernel/entry/common.c:105 [inline]
>   exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
>   __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline]
>   syscall_exit_to_user_mode+0x156/0x2b0 kernel/entry/common.c:212
>   do_syscall_64+0xe5/0x270 arch/x86/entry/common.c:89
>   entry_SYSCALL_64_after_hwframe+0x6f/0x77
> RIP: 0033:0x7f1bdbc2d88d
> Code: c3 e8 a7 1f 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48
> 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d
> 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007ffd12f6fa18 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> RAX: 0000000000000001 RBX: 000000000000220b RCX: 00007f1bdbc2d88d
> RDX: 0000000000000000 RSI: 0000000000005012 RDI: 0000000000000003
> RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
> R13: 431bde82d7b634db R14: 00007f1bdbcaa4f0 R15: 0000000000000001
>   </TASK>
> 
> 
> Syzkaller reproducer:
> # {Threaded:false Repeat:true RepeatTimes:0 Procs:1 Slowdown:1
> Sandbox: SandboxArg:0 Leak:false NetInjection:false NetDevices:false
> NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:false
> KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false
> Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false
> HandleSegv:false Repro:false Trace:false LegacyOptions:{Collide:false
> Fault:false FaultCall:0 FaultNth:0}}
> r0 = syz_io_uring_setup(0x220b, &(0x7f0000000000)={0x0, 0x63db,
> 0x10000, 0x800}, &(0x7f0000000080)=<r1=>0x0,
> &(0x7f0000000200)=<r2=>0x0)
> r3 = socket$inet(0x2, 0x1, 0x0)
> syz_io_uring_submit(r1, r2,
> &(0x7f0000000a80)=@IORING_OP_POLL_ADD={0x6, 0x0, 0x0, @fd=r3, 0x0,
> 0x0, 0x1})
> io_uring_enter(r0, 0x5012, 0x0, 0x0, 0x0, 0x0)
> ```
> 
> crepro is in the attachment.
> 
> Best Regards
> Xdchase

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [io-uring] WARNING in io_fill_cqe_req_aux
  2024-06-12  1:11 ` Pavel Begunkov
@ 2024-06-12  7:10   ` chase xd
  2024-06-12 12:35     ` Pavel Begunkov
  0 siblings, 1 reply; 5+ messages in thread
From: chase xd @ 2024-06-12  7:10 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring, linux-kernel

Sorry, now I'm also a bit confused about which branch to choose. I
checked out the branch "for-6.9/io_uring" and started testing on it. I
assumed that was the latest version of io_uring at the time; even now,
when I check out that branch, the bug still exists. How can I tell
whether a branch will be merged, and which branch do you think I
should test on? Thanks.

Pavel Begunkov <[email protected]> 于2024年6月12日周三 03:11写道:
>
> On 6/7/24 18:07, chase xd wrote:
> > Dear Linux kernel maintainers,
> >
> > Syzkaller reports this previously unknown bug on Linux
> > 6.8.0-rc3-00043-ga69d20885494-dirty #4. Seems like the bug was
> > silently or unintendedly fixed in the latest version.
>
> That branch you're using is confusing, apart from being
> dirty and rc3, apparently it has never been merged. The
> patch the test fails on looks different upstream:
>
>
> commit 902ce82c2aa130bea5e3feca2d4ae62781865da7
> Author: Pavel Begunkov <[email protected]>
> Date:   Mon Mar 18 22:00:32 2024 +0000
>
>      io_uring: get rid of intermediate aux cqe caches
>
>
> It reproduces with your version but not with anything
> upstream
>
>
> > ```
> > Syzkaller hit 'WARNING in io_fill_cqe_req_aux' bug.
> >
> > ------------[ cut here ]------------
> > WARNING: CPU: 7 PID: 8369 at io_uring/io_uring.h:132
> > io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
> > Modules linked in:
> > CPU: 7 PID: 8369 Comm: syz-executor263 Not tainted
> > 6.8.0-rc3-00043-ga69d20885494-dirty #4
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> > RIP: 0010:io_lockdep_assert_cq_locked+0x2c7/0x340 io_uring/io_uring.h:132
> > Code: 48 8d bb 98 03 00 00 be ff ff ff ff e8 52 45 4b 06 31 ff 89 c3
> > 89 c6 e8 b7 e2 2d fd 85 db 0f 85 d5 fe ff ff e8 0a e7 2d fd 90 <0f> 0b
> > 90 e9 c7 fe ff ff e8 fc e6 2d fd e8 c7 38 fa fc 48 85 c0 0f
> > RSP: 0018:ffffc90012af79a8 EFLAGS: 00010293
> > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff845cf059
> > RDX: ffff8880252ea440 RSI: ffffffff845cf066 RDI: 0000000000000005
> > RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000
> > R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
> > R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
> > FS:  00005555570e13c0(0000) GS:ffff88823bd80000(0000) knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 00007f1bdbcae020 CR3: 0000000022624000 CR4: 0000000000750ef0
> > PKRU: 55555554
> > Call Trace:
> >   <TASK>
> >   io_fill_cqe_req_aux+0xd6/0x1f0 io_uring/io_uring.c:925
> >   io_poll_check_events io_uring/poll.c:325 [inline]
> >   io_poll_task_func+0x16f/0x1000 io_uring/poll.c:357
> >   io_handle_tw_list+0x172/0x560 io_uring/io_uring.c:1154
> >   tctx_task_work_run+0xaa/0x330 io_uring/io_uring.c:1226
> >   tctx_task_work+0x7b/0xd0 io_uring/io_uring.c:1244
> >   task_work_run+0x16d/0x260 kernel/task_work.c:180
> >   get_signal+0x1cb/0x25a0 kernel/signal.c:2669
> >   arch_do_signal_or_restart+0x81/0x7e0 arch/x86/kernel/signal.c:310
> >   exit_to_user_mode_loop kernel/entry/common.c:105 [inline]
> >   exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
> >   __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline]
> >   syscall_exit_to_user_mode+0x156/0x2b0 kernel/entry/common.c:212
> >   do_syscall_64+0xe5/0x270 arch/x86/entry/common.c:89
> >   entry_SYSCALL_64_after_hwframe+0x6f/0x77
> > RIP: 0033:0x7f1bdbc2d88d
> > Code: c3 e8 a7 1f 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48
> > 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d
> > 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> > RSP: 002b:00007ffd12f6fa18 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> > RAX: 0000000000000001 RBX: 000000000000220b RCX: 00007f1bdbc2d88d
> > RDX: 0000000000000000 RSI: 0000000000005012 RDI: 0000000000000003
> > RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
> > R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
> > R13: 431bde82d7b634db R14: 00007f1bdbcaa4f0 R15: 0000000000000001
> >   </TASK>
> >
> >
> > Syzkaller reproducer:
> > # {Threaded:false Repeat:true RepeatTimes:0 Procs:1 Slowdown:1
> > Sandbox: SandboxArg:0 Leak:false NetInjection:false NetDevices:false
> > NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:false
> > KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false
> > Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false
> > HandleSegv:false Repro:false Trace:false LegacyOptions:{Collide:false
> > Fault:false FaultCall:0 FaultNth:0}}
> > r0 = syz_io_uring_setup(0x220b, &(0x7f0000000000)={0x0, 0x63db,
> > 0x10000, 0x800}, &(0x7f0000000080)=<r1=>0x0,
> > &(0x7f0000000200)=<r2=>0x0)
> > r3 = socket$inet(0x2, 0x1, 0x0)
> > syz_io_uring_submit(r1, r2,
> > &(0x7f0000000a80)=@IORING_OP_POLL_ADD={0x6, 0x0, 0x0, @fd=r3, 0x0,
> > 0x0, 0x1})
> > io_uring_enter(r0, 0x5012, 0x0, 0x0, 0x0, 0x0)
> > ```
> >
> > crepro is in the attachment.
> >
> > Best Regards
> > Xdchase
>
> --
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [io-uring] WARNING in io_fill_cqe_req_aux
  2024-06-12  7:10   ` chase xd
@ 2024-06-12 12:35     ` Pavel Begunkov
  2024-06-12 13:52       ` Pavel Begunkov
  0 siblings, 1 reply; 5+ messages in thread
From: Pavel Begunkov @ 2024-06-12 12:35 UTC (permalink / raw)
  To: chase xd, Jens Axboe, io-uring, linux-kernel

On 6/12/24 08:10, chase xd wrote:
> Sorry now I'm also a bit confused by the branch choosing. I checked
> out branch "for-6.9/io_uring" and started testing on that branch. I
> assume that was the latest version of io_uring at that time, even now
> I check out that branch and the bug still exists. How should I know
> whether the branch will be merged, and which branch do you think I
> should test on? Thanks.

# git show a69d20885494:io_uring/io_uring.c | grep -A 13 io_fill_cqe_req_aux
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
         struct io_ring_ctx *ctx = req->ctx;
         u64 user_data = req->cqe.user_data;

         if (!defer)
                 return __io_post_aux_cqe(ctx, user_data, res, cflags, false);

         lockdep_assert_held(&ctx->uring_lock);
         io_lockdep_assert_cq_locked(ctx);

         ctx->submit_state.flush_cqes = true;
         return io_fill_cqe_aux(ctx, user_data, res, cflags);
}

That's the buggy version from the hash you're testing. IIRC it
was in the tree for longer than necessary, which is presumably
why you found it, but it was never sent to Linus. Below are the
current state of for-6.9 and what it was replaced with,
respectively. Let me separately check for-6.9/io_uring if you're
concerned about it.




# git show for-6.9/io_uring:io_uring/io_uring.c | grep -A 30 io_fill_cqe_req_aux
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
         struct io_ring_ctx *ctx = req->ctx;
         u64 user_data = req->cqe.user_data;
         struct io_uring_cqe *cqe;

         lockdep_assert(!io_wq_current_is_worker());

         if (!defer)
                 return __io_post_aux_cqe(ctx, user_data, res, cflags, false);

         lockdep_assert_held(&ctx->uring_lock);

         if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
...

# git show origin/for-6.10/io_uring:io_uring/io_uring.c | grep -A 13 io_req_post_cqe
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{
         struct io_ring_ctx *ctx = req->ctx;
         bool posted;

         lockdep_assert(!io_wq_current_is_worker());
         lockdep_assert_held(&ctx->uring_lock);

         __io_cq_lock(ctx);
         posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
         ctx->submit_state.cq_flush = true;
         __io_cq_unlock_post(ctx);
         return posted;
}

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [io-uring] WARNING in io_fill_cqe_req_aux
  2024-06-12 12:35     ` Pavel Begunkov
@ 2024-06-12 13:52       ` Pavel Begunkov
  0 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov @ 2024-06-12 13:52 UTC (permalink / raw)
  To: chase xd, Jens Axboe, io-uring, linux-kernel

On 6/12/24 13:35, Pavel Begunkov wrote:
> On 6/12/24 08:10, chase xd wrote:
>> Sorry now I'm also a bit confused by the branch choosing. I checked
>> out branch "for-6.9/io_uring" and started testing on that branch. I
>> assume that was the latest version of io_uring at that time, even now
>> I check out that branch and the bug still exists. How should I know
>> whether the branch will be merged, and which branch do you think I
>> should test on? Thanks.
> 
> # git show a69d20885494:io_uring/io_uring.c | grep -A 13 io_fill_cqe_req_aux
> bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
> {
>          struct io_ring_ctx *ctx = req->ctx;
>          u64 user_data = req->cqe.user_data;
> 
>          if (!defer)
>                  return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
> 
>          lockdep_assert_held(&ctx->uring_lock);
>          io_lockdep_assert_cq_locked(ctx);
> 
>          ctx->submit_state.flush_cqes = true;
>          return io_fill_cqe_aux(ctx, user_data, res, cflags);
> }
> 
> That's the buggy version from the hash you're testing. IIRC it
> was in the tree for longer than necessary, which is presumably
> why you found it, but it was never sent to Linus. Below are the
> current state of for-6.9 and what it was replaced with,
> respectively. Let me separately check for-6.9/io_uring if you're
> concerned about it.

In other words, it happens that bugs appear in the branches
but get rooted out before they get anywhere. The main confusion
is that the version you're looking at was fixed up back somewhere
in March. That's fine, I'd just recommend fetching the repo and
updating your base.

I can't hit the problem with for-6.9/io_uring, which makes sense
because it's lacking the patch I'd blame it on. I'm confused
how you see it there.


> # git show for-6.9/io_uring:io_uring/io_uring.c | grep -A 30 io_fill_cqe_req_aux
> bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
> {
>          struct io_ring_ctx *ctx = req->ctx;
>          u64 user_data = req->cqe.user_data;
>          struct io_uring_cqe *cqe;
> 
>          lockdep_assert(!io_wq_current_is_worker());
> 
>          if (!defer)
>                  return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
> 
>          lockdep_assert_held(&ctx->uring_lock);
> 
>          if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
> ...
> 
> # git show origin/for-6.10/io_uring:io_uring/io_uring.c | grep -A 13 io_req_post_cqe
> bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
> {
>          struct io_ring_ctx *ctx = req->ctx;
>          bool posted;
> 
>          lockdep_assert(!io_wq_current_is_worker());
>          lockdep_assert_held(&ctx->uring_lock);
> 
>          __io_cq_lock(ctx);
>          posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
>          ctx->submit_state.cq_flush = true;
>          __io_cq_unlock_post(ctx);
>          return posted;
> }
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-12 13:52 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-07 17:07 [io-uring] WARNING in io_fill_cqe_req_aux chase xd
2024-06-12  1:11 ` Pavel Begunkov
2024-06-12  7:10   ` chase xd
2024-06-12 12:35     ` Pavel Begunkov
2024-06-12 13:52       ` Pavel Begunkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox