* [PATCH] io_uring: make overflowing cqe subject to OOM
@ 2025-12-29 20:19 Alexandre Negrel
2025-12-30 0:23 ` Jens Axboe
2025-12-30 7:15 ` [syzbot ci] " syzbot ci
0 siblings, 2 replies; 5+ messages in thread
From: Alexandre Negrel @ 2025-12-29 20:19 UTC (permalink / raw)
To: axboe, io-uring, linux-kernel; +Cc: Alexandre Negrel
Overflowing CQEs are now allocated with GFP_KERNEL instead of GFP_ATOMIC.
The OOM killer is triggered on overflow, and it is no longer possible to
exceed cgroup memory limits.
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220794
Signed-off-by: Alexandre Negrel <alexandre@negrel.dev>
---
io_uring/io_uring.c | 34 +++++++++-------------------------
1 file changed, 9 insertions(+), 25 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cb24cdf8e68..5ff1a13fed1c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -545,31 +545,12 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_eventfd_signal(ctx, true);
}
-static inline void __io_cq_lock(struct io_ring_ctx *ctx)
-{
- if (!ctx->lockless_cq)
- spin_lock(&ctx->completion_lock);
-}
-
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
spin_lock(&ctx->completion_lock);
}
-static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
-{
- io_commit_cqring(ctx);
- if (!ctx->task_complete) {
- if (!ctx->lockless_cq)
- spin_unlock(&ctx->completion_lock);
- /* IOPOLL rings only need to wake up if it's also SQPOLL */
- if (!ctx->syscall_iopoll)
- io_cqring_wake(ctx);
- }
- io_commit_cqring_flush(ctx);
-}
-
static void io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
@@ -1513,7 +1494,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_submit_state *state = &ctx->submit_state;
struct io_wq_work_node *node;
- __io_cq_lock(ctx);
__wq_list_for_each(node, &state->compl_reqs) {
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
@@ -1525,13 +1505,17 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
- if (ctx->lockless_cq)
- io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
- else
- io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
+ io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
}
}
- __io_cq_unlock_post(ctx);
+
+ io_commit_cqring(ctx);
+ if (!ctx->task_complete) {
+ /* IOPOLL rings only need to wake up if it's also SQPOLL */
+ if (!ctx->syscall_iopoll)
+ io_cqring_wake(ctx);
+ }
+ io_commit_cqring_flush(ctx);
if (!wq_list_empty(&state->compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
--
2.51.0
* Re: [PATCH] io_uring: make overflowing cqe subject to OOM
2025-12-29 20:19 [PATCH] io_uring: make overflowing cqe subject to OOM Alexandre Negrel
@ 2025-12-30 0:23 ` Jens Axboe
2025-12-30 14:50 ` Alexandre Negrel
2025-12-30 7:15 ` [syzbot ci] " syzbot ci
1 sibling, 1 reply; 5+ messages in thread
From: Jens Axboe @ 2025-12-30 0:23 UTC (permalink / raw)
To: Alexandre Negrel, io-uring, linux-kernel
On 12/29/25 1:19 PM, Alexandre Negrel wrote:
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 6cb24cdf8e68..5ff1a13fed1c 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -545,31 +545,12 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
> io_eventfd_signal(ctx, true);
> }
>
> -static inline void __io_cq_lock(struct io_ring_ctx *ctx)
> -{
> - if (!ctx->lockless_cq)
> - spin_lock(&ctx->completion_lock);
> -}
> -
> static inline void io_cq_lock(struct io_ring_ctx *ctx)
> __acquires(ctx->completion_lock)
> {
> spin_lock(&ctx->completion_lock);
> }
>
> -static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
> -{
> - io_commit_cqring(ctx);
> - if (!ctx->task_complete) {
> - if (!ctx->lockless_cq)
> - spin_unlock(&ctx->completion_lock);
> - /* IOPOLL rings only need to wake up if it's also SQPOLL */
> - if (!ctx->syscall_iopoll)
> - io_cqring_wake(ctx);
> - }
> - io_commit_cqring_flush(ctx);
> -}
> -
> static void io_cq_unlock_post(struct io_ring_ctx *ctx)
> __releases(ctx->completion_lock)
> {
> @@ -1513,7 +1494,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
> struct io_submit_state *state = &ctx->submit_state;
> struct io_wq_work_node *node;
>
> - __io_cq_lock(ctx);
> __wq_list_for_each(node, &state->compl_reqs) {
> struct io_kiocb *req = container_of(node, struct io_kiocb,
> comp_list);
> @@ -1525,13 +1505,17 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
> */
> if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
> unlikely(!io_fill_cqe_req(ctx, req))) {
> - if (ctx->lockless_cq)
> - io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
> - else
> - io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
> + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
> }
> }
> - __io_cq_unlock_post(ctx);
> +
> + io_commit_cqring(ctx);
> + if (!ctx->task_complete) {
> + /* IOPOLL rings only need to wake up if it's also SQPOLL */
> + if (!ctx->syscall_iopoll)
> + io_cqring_wake(ctx);
> + }
> + io_commit_cqring_flush(ctx);
>
> if (!wq_list_empty(&state->compl_reqs)) {
> io_free_batch_list(ctx, state->compl_reqs.first);
You seem to just remove the lock around posting CQEs, and hence it can
then use GFP_KERNEL? That's very broken... I'm assuming the issue here is
that memcg will look at __GFP_HIGH somehow and allow it to proceed?
Surely that should not stop OOM, just defer it?
In any case, the below should do the same. Can you test?
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cb24cdf8e68..709943fedaf4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -864,7 +864,7 @@ static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
{
struct io_overflow_cqe *ocqe;
- ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_NOWAIT);
return io_cqring_add_overflow(ctx, ocqe);
}
--
Jens Axboe
* [syzbot ci] Re: io_uring: make overflowing cqe subject to OOM
2025-12-29 20:19 [PATCH] io_uring: make overflowing cqe subject to OOM Alexandre Negrel
2025-12-30 0:23 ` Jens Axboe
@ 2025-12-30 7:15 ` syzbot ci
1 sibling, 0 replies; 5+ messages in thread
From: syzbot ci @ 2025-12-30 7:15 UTC (permalink / raw)
To: alexandre, axboe, io-uring, linux-kernel; +Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v1] io_uring: make overflowing cqe subject to OOM
https://lore.kernel.org/all/20251229201933.515797-1-alexandre@negrel.dev
* [PATCH] io_uring: make overflowing cqe subject to OOM
and found the following issue:
WARNING in io_get_cqe_overflow
Full report is available here:
https://ci.syzbot.org/series/d71ff874-04e1-4cc7-b7c0-e48eb77bc984
***
WARNING in io_get_cqe_overflow
tree: torvalds
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base: f8f9c1f4d0c7a64600e2ca312dec824a0bc2f1da
arch: amd64
compiler: Debian clang version 21.1.8 (++20251202083448+f68f64eb8130-1~exp1~20251202083504.46), Debian LLD 21.1.8
config: https://ci.syzbot.org/builds/c13ff885-4e87-4df2-9d49-a52f3d040dd6/config
C repro: https://ci.syzbot.org/findings/e3c6e33b-4e4e-45de-b5b7-4ff5fc363989/c_repro
syz repro: https://ci.syzbot.org/findings/e3c6e33b-4e4e-45de-b5b7-4ff5fc363989/syz_repro
------------[ cut here ]------------
WARNING: io_uring/io_uring.h:211 at io_lockdep_assert_cq_locked io_uring/io_uring.h:211 [inline], CPU#0: syz.0.17/5984
WARNING: io_uring/io_uring.h:211 at io_get_cqe_overflow+0x599/0x730 io_uring/io_uring.h:249, CPU#0: syz.0.17/5984
Modules linked in:
CPU: 0 UID: 0 PID: 5984 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:io_lockdep_assert_cq_locked io_uring/io_uring.h:211 [inline]
RIP: 0010:io_get_cqe_overflow+0x599/0x730 io_uring/io_uring.h:249
Code: 0f 0b 90 e9 0e fb ff ff e8 c4 29 35 fd 90 0f 0b 90 e9 91 fb ff ff e8 b6 29 35 fd 90 0f 0b 90 e9 6a fd ff ff e8 a8 29 35 fd 90 <0f> 0b 90 e9 5c fd ff ff e8 9a 29 35 fd 90 0f 0b 90 e9 4a fd ff ff
RSP: 0018:ffffc90003cf7958 EFLAGS: 00010293
RAX: ffffffff848d4748 RBX: 0000000000000000 RCX: ffff888110c8ba80
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000000 R08: ffff888110c8ba80 R09: 0000000000000002
R10: 00000000fffffdef R11: 0000000000000000 R12: 0000000000000000
R13: 1ffff11022544c00 R14: ffff888112a26000 R15: dffffc0000000000
FS: 00005555940b9500(0000) GS:ffff88818e40f000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b30563fff CR3: 00000001731de000 CR4: 00000000000006f0
Call Trace:
<TASK>
io_get_cqe io_uring/io_uring.h:271 [inline]
io_fill_cqe_req io_uring/io_uring.h:293 [inline]
__io_submit_flush_completions+0x11c/0xe50 io_uring/io_uring.c:1507
io_submit_flush_completions io_uring/io_uring.h:239 [inline]
io_submit_state_end io_uring/io_uring.c:2318 [inline]
io_submit_sqes+0x1c26/0x2130 io_uring/io_uring.c:2433
__do_sys_io_uring_enter io_uring/io_uring.c:3264 [inline]
__se_sys_io_uring_enter+0x2f7/0x2c30 io_uring/io_uring.c:3203
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f00d579acb9
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffc380f6178 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
RAX: ffffffffffffffda RBX: 00007f00d5a05fa0 RCX: 00007f00d579acb9
RDX: 0000000000000002 RSI: 0000000000004d10 RDI: 0000000000000003
RBP: 00007f00d5808bf7 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000002 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f00d5a05fac R14: 00007f00d5a05fa0 R15: 00007f00d5a05fa0
</TASK>
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
* Re: [PATCH] io_uring: make overflowing cqe subject to OOM
2025-12-30 0:23 ` Jens Axboe
@ 2025-12-30 14:50 ` Alexandre Negrel
2025-12-30 16:01 ` Jens Axboe
0 siblings, 1 reply; 5+ messages in thread
From: Alexandre Negrel @ 2025-12-30 14:50 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, linux-kernel
On Tuesday, December 30th, 2025 at 1:23 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 12/29/25 1:19 PM, Alexandre Negrel wrote:
>
> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> > index 6cb24cdf8e68..5ff1a13fed1c 100644
> > --- a/io_uring/io_uring.c
> > +++ b/io_uring/io_uring.c
> > @@ -545,31 +545,12 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
> > io_eventfd_signal(ctx, true);
> > }
> >
> > -static inline void __io_cq_lock(struct io_ring_ctx *ctx)
> > -{
> > - if (!ctx->lockless_cq)
> > - spin_lock(&ctx->completion_lock);
> > -}
> > -
> > static inline void io_cq_lock(struct io_ring_ctx *ctx)
> > __acquires(ctx->completion_lock)
> > {
> > spin_lock(&ctx->completion_lock);
> > }
> >
> > -static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
> > -{
> > - io_commit_cqring(ctx);
> > - if (!ctx->task_complete) {
> > - if (!ctx->lockless_cq)
> > - spin_unlock(&ctx->completion_lock);
> > - /* IOPOLL rings only need to wake up if it's also SQPOLL */
> > - if (!ctx->syscall_iopoll)
> > - io_cqring_wake(ctx);
> > - }
> > - io_commit_cqring_flush(ctx);
> > -}
> > -
> > static void io_cq_unlock_post(struct io_ring_ctx *ctx)
> > __releases(ctx->completion_lock)
> > {
> > @@ -1513,7 +1494,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
> > struct io_submit_state *state = &ctx->submit_state;
> > struct io_wq_work_node *node;
> >
> > - __io_cq_lock(ctx);
> > __wq_list_for_each(node, &state->compl_reqs) {
> > struct io_kiocb *req = container_of(node, struct io_kiocb,
> > comp_list);
> > @@ -1525,13 +1505,17 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
> > */
> > if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
> > unlikely(!io_fill_cqe_req(ctx, req))) {
> > - if (ctx->lockless_cq)
> > - io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
> > - else
> > - io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
> > + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
> > }
> > }
> > - __io_cq_unlock_post(ctx);
> > +
> > + io_commit_cqring(ctx);
> > + if (!ctx->task_complete) {
> > + /* IOPOLL rings only need to wake up if it's also SQPOLL */
> > + if (!ctx->syscall_iopoll)
> > + io_cqring_wake(ctx);
> > + }
> > + io_commit_cqring_flush(ctx);
> >
> > if (!wq_list_empty(&state->compl_reqs)) {
> > io_free_batch_list(ctx, state->compl_reqs.first);
>
>
> You seem to just remove the lock around posting CQEs, and hence it can
> then use GFP_KERNEL? That's very broken... I'm assuming the issue here is
> that memcg will look at __GFP_HIGH somehow and allow it to proceed?
> Surely that should not stop OOM, just defer it?
>
> In any case, the below should do the same. Can you test?
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 6cb24cdf8e68..709943fedaf4 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -864,7 +864,7 @@ static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
> {
> struct io_overflow_cqe *ocqe;
>
> - ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
> + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_NOWAIT);
> return io_cqring_add_overflow(ctx, ocqe);
> }
>
>
> --
> Jens Axboe
> You seem to just remove the lock around posting CQEs, and hence it can
> then use GFP_KERNEL? That's very broken...
This is my first time contributing to the Linux kernel, sorry if my patch is
broken.
> I'm assuming the issue here is that memcg will look at __GFP_HIGH somehow and
> allow it to proceed?
Exactly, the allocation succeeds even though it exceeds cgroup limits. After
digging through try_charge_memcg(), it seems that the OOM killer isn't involved
unless the __GFP_DIRECT_RECLAIM bit is set (see gfpflags_allow_blocking()):
https://github.com/torvalds/linux/blob/8640b74557fc8b4c300030f6ccb8cd078f665ec8/mm/memcontrol.c#L2329
https://github.com/torvalds/linux/blob/8640b74557fc8b4c300030f6ccb8cd078f665ec8/include/linux/gfp.h#L38
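For context, the relevant logic boils down to this (paraphrased from a
recent tree and heavily simplified - see the links above for the real code):
	/* include/linux/gfp_types.h */
	#define GFP_ATOMIC	(__GFP_HIGH | __GFP_KSWAPD_RECLAIM)
	#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)
	#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
	/* only __GFP_RECLAIM contains __GFP_DIRECT_RECLAIM, the bit that
	 * gfpflags_allow_blocking() tests */
	/* mm/memcontrol.c, try_charge_memcg() */
	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;		/* no reclaim, no memcg OOM */
	/* ... direct reclaim, possibly mem_cgroup_oom() ... */
nomem:
	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
		return -ENOMEM;
	/* fall through: __GFP_HIGH (i.e. GFP_ATOMIC) overcharges the memcg */
So GFP_ATOMIC slips past the limit via __GFP_HIGH, GFP_NOWAIT fails the
charge with -ENOMEM, and only GFP_KERNEL can block and trigger the memcg
OOM killer.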
> In any case, the below should do the same. Can you test?
I tried it and it seems to fix the issue, but in a different way:
try_charge_memcg() now returns -ENOMEM and the allocation fails. The completion
queue entry is "dropped on the floor" in io_cqring_add_overflow().
So I see 3 options here:
* use GFP_NOWAIT, if dropping CQEs is ok
* allocate using GFP_KERNEL_ACCOUNT without holding the lock, then add the
overflowing entries while holding the completion_lock (iterating twice over
compl_reqs; see the sketch after this list)
* charge memory after releasing the lock. I don't know if this is possible, but
doing kfree(kmalloc(1, GFP_KERNEL_ACCOUNT)) after releasing the lock does the
job (even though it's dirty).
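For option 2, I mean something along these lines, per request (completely
untested sketch; cq_ring_needs_overflow() is a made-up helper):
	/* pass 1: lock not held; may block and hit the memcg OOM killer */
	struct io_overflow_cqe *ocqe = NULL;
	if (cq_ring_needs_overflow(ctx))
		ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe,
				     GFP_KERNEL_ACCOUNT);
	/* pass 2: publish under completion_lock; if the ring filled up and
	 * we have no preallocated entry, this becomes the drop path */
	spin_lock(&ctx->completion_lock);
	if (!io_fill_cqe_req(ctx, req))
		io_cqring_add_overflow(ctx, ocqe);
	else
		kfree(ocqe);
	spin_unlock(&ctx->completion_lock);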
Let me know what you think.
Alexandre Negrel
https://www.negrel.dev/
* Re: [PATCH] io_uring: make overflowing cqe subject to OOM
2025-12-30 14:50 ` Alexandre Negrel
@ 2025-12-30 16:01 ` Jens Axboe
0 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2025-12-30 16:01 UTC (permalink / raw)
To: Alexandre Negrel; +Cc: io-uring, linux-kernel
On 12/30/25 7:50 AM, Alexandre Negrel wrote:
>> I'm assuming the issue here is that memcg will look at __GFP_HIGH
>> somehow and allow it to proceed?
>
> Exactly, the allocation succeeds even though it exceeds cgroup limits.
> After digging through try_charge_memcg(), it seems that the OOM killer
> isn't involved unless the __GFP_DIRECT_RECLAIM bit is set (see
> gfpflags_allow_blocking()).
>
> https://github.com/torvalds/linux/blob/8640b74557fc8b4c300030f6ccb8cd078f665ec8/mm/memcontrol.c#L2329
> https://github.com/torvalds/linux/blob/8640b74557fc8b4c300030f6ccb8cd078f665ec8/include/linux/gfp.h#L38
>
>> In any case, the below should do the same. Can you test?
>
> I tried it and it seems to fix the issue, but in a different way:
> try_charge_memcg() now returns -ENOMEM and the allocation fails. The
> completion queue entry is "dropped on the floor" in
> io_cqring_add_overflow().
>
> So I see 3 options here:
> * use GFP_NOWAIT, if dropping CQEs is ok
We're utterly out of memory at that point, so something has to give. We
can't invent memory out of thin air. Hence dropping the event, and
logging it as such, is imho the way to go. The same thing would've
happened with GFP_ATOMIC, just a bit earlier in the process.
It's worth noting that these are extreme circumstances - the kernel is
completely out of memory, and this will cause various spurious failures
to complete syscalls or other events. Additionally, this is the
non-DEFER_TASKRUN case - DEFER_TASKRUN is what people should be using
anyway.
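For reference, the drop isn't silent - paraphrasing what the current
io_cqring_add_overflow() does when the allocation failed:
	if (!ocqe) {
		struct io_rings *r = ctx->rings;
		/* can't allocate the overflow entry: count the lost CQE
		 * and flag the ring so userspace can notice the drop */
		WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}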
> * allocate using GFP_KERNEL_ACCOUNT without holding the lock, then add the
> overflowing entries while holding the completion_lock (iterating twice over
> compl_reqs)
The only viable way to do that would be to allocate it upfront, which is a
huge waste of time for the normal case where the CQ ring isn't
overflowing. We should not optimize for the slow/broken case, where
userspace overflows the ring.
> * charge memory after releasing the lock. I don't know if this is possible,
> but doing kfree(kmalloc(1, GFP_KERNEL_ACCOUNT)) after releasing the lock does
> the job (even though it's dirty).
And that's definitely a no-go as well.
--
Jens Axboe
end of thread, other threads:[~2025-12-30 16:01 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
2025-12-29 20:19 [PATCH] io_uring: make overflowing cqe subject to OOM Alexandre Negrel
2025-12-30 0:23 ` Jens Axboe
2025-12-30 14:50 ` Alexandre Negrel
2025-12-30 16:01 ` Jens Axboe
2025-12-30 7:15 ` [syzbot ci] " syzbot ci