public inbox for io-uring@vger.kernel.org
From: Pavel Begunkov <asml.silence@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, io-uring@vger.kernel.org
Subject: Re: [PATCH 2/4] io_uring: move locking inside overflow posting
Date: Thu, 15 May 2025 12:04:32 +0100
Message-ID: <f70222a4-b743-48a8-8ea9-c971e0dcf64f@gmail.com>
In-Reply-To: <a645a6a2-d722-4ef1-bdfd-3701ab315584@kernel.dk>

On 5/14/25 22:52, Jens Axboe wrote:
> Since sometimes it's easier to talk in code than in English, something
> like the below (just applied on top, and utterly untested) I think is
> much cleaner. Didn't spend a lot of time on it, I'm sure it could get
> condensed down some more with a helper or something. But it keeps the
> locking in the caller, while still allowing GFP_KERNEL alloc for
> lockless_cq.
> 
> Somewhat unrelated, but also fills in an io_cqe and passes in for the
> non-cqe32 parts, just for readability's sake.
> 
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index da1075b66a87..9b6d09633a29 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -744,44 +744,12 @@ static bool __io_cqring_event_overflow(struct io_ring_ctx *ctx,
>   	return true;
>   }
>   
> -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, bool locked,
> -				     u64 user_data, s32 res, u32 cflags,
> -				     u64 extra1, u64 extra2)
> +static void io_cqring_event_overflow(struct io_ring_ctx *ctx,
> +				     struct io_overflow_cqe *ocqe)
>   {
> -	struct io_overflow_cqe *ocqe;
> -	size_t ocq_size = sizeof(struct io_overflow_cqe);
> -	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
> -	gfp_t gfp = GFP_KERNEL_ACCOUNT;
> -	bool queued;
> -
> -	io_lockdep_assert_cq_locked(ctx);
> -
> -	if (is_cqe32)
> -		ocq_size += sizeof(struct io_uring_cqe);
> -	if (locked)
> -		gfp = GFP_ATOMIC | __GFP_ACCOUNT;
> -
> -	ocqe = kmalloc(ocq_size, gfp);
> -	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
> -
> -	if (ocqe) {
> -		ocqe->cqe.user_data = user_data;
> -		ocqe->cqe.res = res;
> -		ocqe->cqe.flags = cflags;
> -		if (is_cqe32) {
> -			ocqe->cqe.big_cqe[0] = extra1;
> -			ocqe->cqe.big_cqe[1] = extra2;
> -		}
> -	}
> -
> -	if (locked) {
> -		queued = __io_cqring_event_overflow(ctx, ocqe);
> -	} else {
> -		spin_lock(&ctx->completion_lock);
> -		queued = __io_cqring_event_overflow(ctx, ocqe);
> -		spin_unlock(&ctx->completion_lock);
> -	}
> -	return queued;
> +	spin_lock(&ctx->completion_lock);
> +	__io_cqring_event_overflow(ctx, ocqe);
> +	spin_unlock(&ctx->completion_lock);
>   }
>   
>   /*
> @@ -842,15 +810,49 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
>   	return false;
>   }
>   
> +static struct io_overflow_cqe *io_get_ocqe(struct io_ring_ctx *ctx,
> +					   struct io_cqe *cqe, u64 extra1,
> +					   u64 extra2, gfp_t gfp)
> +{
> +	size_t ocq_size = sizeof(struct io_overflow_cqe);
> +	bool is_cqe32 = ctx->flags & IORING_SETUP_CQE32;
> +	struct io_overflow_cqe *ocqe;
> +
> +	if (is_cqe32)
> +		ocq_size += sizeof(struct io_uring_cqe);
> +
> +	ocqe = kmalloc(ocq_size, gfp);
> +	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
> +	if (ocqe) {
> +		ocqe->cqe.user_data = cqe->user_data;
> +		ocqe->cqe.res = cqe->res;
> +		ocqe->cqe.flags = cqe->flags;
> +		if (is_cqe32) {
> +			ocqe->cqe.big_cqe[0] = extra1;
> +			ocqe->cqe.big_cqe[1] = extra2;
> +		}
> +	}
> +
> +	return ocqe;
> +}
> +
>   bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
>   {
>   	bool filled;
>   
>   	io_cq_lock(ctx);
>   	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
> -	if (!filled)
> -		filled = io_cqring_event_overflow(ctx, true,
> -						  user_data, res, cflags, 0, 0);
> +	if (unlikely(!filled)) {
> +		struct io_cqe cqe = {
> +			.user_data	= user_data,
> +			.res		= res,
> +			.flags		= cflags
> +		};
> +		struct io_overflow_cqe *ocqe;
> +
> +		ocqe = io_get_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC);
> +		filled = __io_cqring_event_overflow(ctx, ocqe);
> +	}
>   	io_cq_unlock_post(ctx);
>   	return filled;
>   }
> @@ -864,8 +866,17 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
>   	lockdep_assert_held(&ctx->uring_lock);
>   	lockdep_assert(ctx->lockless_cq);
>   
> -	if (!io_fill_cqe_aux(ctx, user_data, res, cflags))
> -		io_cqring_event_overflow(ctx, false, user_data, res, cflags, 0, 0);
> +	if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
> +		struct io_cqe cqe = {
> +			.user_data	= user_data,
> +			.res		= res,
> +			.flags		= cflags
> +		};
> +		struct io_overflow_cqe *ocqe;
> +
> +		ocqe = io_get_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL);
> +		io_cqring_event_overflow(ctx, ocqe);
> +	}
>   
>   	ctx->submit_state.cq_flush = true;
>   }
> @@ -1437,6 +1448,20 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
>   	} while (node);
>   }
>   
> +static __cold void io_cqe_overflow_fill(struct io_ring_ctx *ctx,
> +					struct io_kiocb *req)
> +{
> +	gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC;
> +	struct io_overflow_cqe *ocqe;
> +
> +	ocqe = io_get_ocqe(ctx, &req->cqe, req->big_cqe.extra1, req->big_cqe.extra2, gfp);
> +	if (ctx->lockless_cq)
> +		io_cqring_event_overflow(ctx, ocqe);
> +	else
> +		__io_cqring_event_overflow(ctx, ocqe);
> +	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
> +}

Implicitly passing the locking state between functions is much worse. And
instead of being contained in a single helper, all callers now need to
care about some allocator function, which doesn't help in making it
cleaner and also contributes to bloating. Let's just leave it how it's
already in the tree.
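
For comparison, the caller side, roughly as in the io_add_aux_cqe() hunk
above:

	/*
	 * What's in the tree: locking and allocation policy stay inside
	 * io_cqring_event_overflow(), the caller only says whether the
	 * CQ is already locked.
	 */
	if (!io_fill_cqe_aux(ctx, user_data, res, cflags))
		io_cqring_event_overflow(ctx, false, user_data, res, cflags, 0, 0);

	/*
	 * With the change above, every caller open codes the overflow
	 * entry allocation and has to pick the right gfp itself.
	 */
	if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
		struct io_cqe cqe = {
			.user_data	= user_data,
			.res		= res,
			.flags		= cflags
		};
		struct io_overflow_cqe *ocqe;

		ocqe = io_get_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL);
		io_cqring_event_overflow(ctx, ocqe);
	}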

-- 
Pavel Begunkov


Thread overview: 12+ messages
2025-05-14  8:07 [PATCH 0/4] overflow completion enhancements Pavel Begunkov
2025-05-14  8:07 ` [PATCH 1/4] io_uring: open code io_req_cqe_overflow() Pavel Begunkov
2025-05-14  8:07 ` [PATCH 2/4] io_uring: move locking inside overflow posting Pavel Begunkov
2025-05-14 16:42   ` Jens Axboe
2025-05-14 17:18     ` Pavel Begunkov
2025-05-14 19:25       ` Jens Axboe
2025-05-14 20:00         ` Pavel Begunkov
2025-05-14 20:05           ` Jens Axboe
2025-05-14 21:52             ` Jens Axboe
2025-05-15 11:04               ` Pavel Begunkov [this message]
2025-05-14  8:07 ` [PATCH 3/4] io_uring: alloc overflow entry before locking Pavel Begunkov
2025-05-14  8:07 ` [PATCH 4/4] io_uring: add lockdep warning for overflow posting Pavel Begunkov
