public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: "Diangang Li" <lidiangang@bytedance.com>
To: "Jens Axboe" <axboe@kernel.dk>,
	"Fengnan Chang" <fengnanchang@gmail.com>,
	 <asml.silence@gmail.com>, <io-uring@vger.kernel.org>
Cc: "Fengnan Chang" <changfengnan@bytedance.com>
Subject: Re: [RFC PATCH 2/2] io_uring: fix io may accumulation in poll mode
Date: Fri, 12 Dec 2025 21:32:37 +0800	[thread overview]
Message-ID: <b640d708-6270-4946-916d-350d323f1678@bytedance.com> (raw)
In-Reply-To: <f763dcd7-dcb3-4cc5-a567-f922cda91ca2@kernel.dk>

On 2025/12/12 13:11, Jens Axboe wrote:
> On 12/11/25 7:12 PM, Fengnan Chang wrote:
>>
>>
>> On 2025/12/12 09:53, Jens Axboe wrote:
>>> On 12/11/25 6:41 PM, Fengnan Chang wrote:
>>>> Oh, we can't add nr_events == iob.nr_reqs check, if
>>>> blk_mq_add_to_batch add failed, completed IO will not add into iob,
>>>> iob.nr_reqs will be 0, this may cause io hang.
>>> Indeed, won't work as-is.
>>>
>>> I do think we're probably making a bigger deal out of the full loop than
>>> necessary. At least I'd be perfectly happy with just the current patch,
>>> performance should be better there than we currently have it. Ideally
>>> we'd have just one loop for polling and catching the completed items,
>>> but that's a bit tricky with the batch completions.
>>
>> Yes, ideally one loop would be enough, but given that there are also
>> multi_queue ctx, that doesn't seem to be possible.
> 
> It's not removing the double loop, but the below could help _only_
> iterate completed requests at the end. Rather than move items between
> the current list at the completion callback, have a separate list just
> for completed requests. Then we can simply iterate that, knowing all of
> them have completed. Gets rid of the ->iopoll_completed as well, and
> then we can move the poll_refs. Not really related at all, obviously
> this patch should be split into multiple pieces.
> 
> This uses a lockless list. But since the producer and consumer are
> generally the same task, that should not add any real overhead. On top
> of the previous one I sent. What do you think?
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 54fd30abf2b8..2d67d95a64ee 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -317,6 +317,7 @@ struct io_ring_ctx {
>   		 */
>   		bool			poll_multi_queue;
>   		struct list_head	iopoll_list;
> +		struct llist_head	iopoll_complete;
>   
>   		struct io_file_table	file_table;
>   		struct io_rsrc_data	buf_table;
> @@ -672,8 +673,9 @@ struct io_kiocb {
>   	};
>   
>   	u8				opcode;
> -	/* polled IO has completed */
> -	u8				iopoll_completed;
> +
> +	bool				cancel_seq_set;
> +
>   	/*
>   	 * Can be either a fixed buffer index, or used with provided buffers.
>   	 * For the latter, it points to the selected buffer ID.
> @@ -700,6 +702,7 @@ struct io_kiocb {
>   	union {
>   		/* used by request caches, completion batching and iopoll */
>   		struct io_wq_work_node	comp_list;
> +		struct llist_node	iopoll_done_list;
>   		/* cache ->apoll->events */
>   		__poll_t apoll_events;
>   	};
> @@ -707,7 +710,7 @@ struct io_kiocb {
>   	struct io_rsrc_node		*file_node;
>   
>   	atomic_t			refs;
> -	bool				cancel_seq_set;
> +	atomic_t			poll_refs;
>   
>   	/*
>   	 * IOPOLL doesn't use task_work, so use the ->iopoll_node list
> @@ -734,7 +737,6 @@ struct io_kiocb {
>   	/* opcode allocated if it needs to store data for async defer */
>   	void				*async_data;
>   	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
> -	atomic_t			poll_refs;
>   	struct io_kiocb			*link;
>   	/* custom credentials, valid IFF REQ_F_CREDS is set */
>   	const struct cred		*creds;
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 05a660c97316..5e503a0bfcfc 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -335,6 +335,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>   	spin_lock_init(&ctx->completion_lock);
>   	raw_spin_lock_init(&ctx->timeout_lock);
>   	INIT_LIST_HEAD(&ctx->iopoll_list);
> +	init_llist_head(&ctx->iopoll_complete);
>   	INIT_LIST_HEAD(&ctx->defer_list);
>   	INIT_LIST_HEAD(&ctx->timeout_list);
>   	INIT_LIST_HEAD(&ctx->ltimeout_list);
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 307f1f39d9f3..ad481ca74a46 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -604,8 +604,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
>   			req->cqe.res = res;
>   	}
>   
> -	/* order with io_iopoll_complete() checking ->iopoll_completed */
> -	smp_store_release(&req->iopoll_completed, 1);
> +	llist_add(&req->iopoll_done_list, &req->ctx->iopoll_complete);
>   }
>   
>   static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
> @@ -870,7 +869,6 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
>   			return -EOPNOTSUPP;
>   		kiocb->private = NULL;
>   		kiocb->ki_flags |= IOCB_HIPRI;
> -		req->iopoll_completed = 0;
>   		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
>   			/* make sure every req only blocks once*/
>   			req->flags &= ~REQ_F_IOPOLL_STATE;
> @@ -1317,7 +1315,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   {
>   	unsigned int poll_flags = 0;
>   	DEFINE_IO_COMP_BATCH(iob);
> -	struct io_kiocb *req, *tmp;
> +	struct llist_node *node;
> +	struct io_kiocb *req;
>   	int nr_events = 0;
>   
>   	/*
> @@ -1327,17 +1326,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   	if (ctx->poll_multi_queue || force_nonspin)
>   		poll_flags |= BLK_POLL_ONESHOT;
>   
> +	/*
> +	 * Loop over uncompleted polled IO requests, and poll for them.
> +	 */
>   	list_for_each_entry(req, &ctx->iopoll_list, iopoll_node) {
>   		int ret;
>   
> -		/*
> -		 * Move completed and retryable entries to our local lists.
> -		 * If we find a request that requires polling, break out
> -		 * and complete those lists first, if we have entries there.
> -		 */
> -		if (READ_ONCE(req->iopoll_completed))
> -			break;

I suggest keeping the iopoll_completed check here, to avoid unnecessary
subsequent polling and to promptly process requests that have already
completed via IRQ.

> -
>   		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
>   			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
>   		else
> @@ -1349,24 +1343,25 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   			poll_flags |= BLK_POLL_ONESHOT;
>   
>   		/* iopoll may have completed current req */
> -		if (!rq_list_empty(&iob.req_list) ||
> -		    READ_ONCE(req->iopoll_completed))
> +		if (!rq_list_empty(&iob.req_list))
>   			break;
>   	}
>   
>   	if (!rq_list_empty(&iob.req_list))
>   		iob.complete(&iob);
>   
> -	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) {
> -		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
> -		if (!smp_load_acquire(&req->iopoll_completed))
> -			continue;
> +	node = llist_del_all(&ctx->iopoll_complete);
> +	while (node) {
> +		struct llist_node *next = node->next;
> +
> +		req = container_of(node, struct io_kiocb, iopoll_done_list);
>   		list_del(&req->iopoll_node);
>   		wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
>   		nr_events++;
>   		req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
>   		if (req->opcode != IORING_OP_URING_CMD)
>   			io_req_rw_cleanup(req, 0);
> +		node = next;
>   	}
>   	if (nr_events)
>   		__io_submit_flush_completions(ctx);
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index 197474911f04..0841fa541f5d 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -159,8 +159,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
>   	}
>   	io_req_uring_cleanup(req, issue_flags);
>   	if (req->ctx->flags & IORING_SETUP_IOPOLL) {
> -		/* order with io_iopoll_req_issued() checking ->iopoll_complete */
> -		smp_store_release(&req->iopoll_completed, 1);
> +		llist_add(&req->iopoll_done_list, &req->ctx->iopoll_complete);
>   	} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
>   		if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED))
>   			return;
> @@ -252,7 +251,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
>   		if (!file->f_op->uring_cmd_iopoll)
>   			return -EOPNOTSUPP;
>   		issue_flags |= IO_URING_F_IOPOLL;
> -		req->iopoll_completed = 0;
>   		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
>   			/* make sure every req only blocks once */
>   			req->flags &= ~REQ_F_IOPOLL_STATE;

  parent reply	other threads:[~2025-12-12 13:33 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-10  8:54 [RFC PATCH 0/2] io_uring: fix io may accumulation in poll mode Fengnan Chang
2025-12-10  8:55 ` [RFC PATCH 1/2] blk-mq: delete task running check in blk_hctx_poll Fengnan Chang
2025-12-10  9:19   ` Jens Axboe
2025-12-10  9:53   ` Jens Axboe
2025-12-10  8:55 ` [RFC PATCH 2/2] io_uring: fix io may accumulation in poll mode Fengnan Chang
2025-12-11  2:15   ` Jens Axboe
2025-12-11  4:10     ` Jens Axboe
2025-12-11  7:38       ` Fengnan
2025-12-11 10:22         ` Jens Axboe
2025-12-11 10:33           ` Jens Axboe
2025-12-11 11:13             ` Fengnan Chang
2025-12-11 11:19               ` Jens Axboe
2025-12-12  1:41             ` Fengnan Chang
2025-12-12  1:53               ` Jens Axboe
2025-12-12  2:12                 ` Fengnan Chang
2025-12-12  5:11                   ` Jens Axboe
2025-12-12  8:58                     ` Jens Axboe
2025-12-12  9:49                       ` Fengnan Chang
2025-12-12 20:22                         ` Jens Axboe
2025-12-12 13:32                     ` Diangang Li [this message]
2025-12-12 20:09                       ` Jens Axboe
2025-12-15  6:25                         ` Diangang Li
2025-12-17 12:34                     ` Diangang Li
2025-12-17 16:25                       ` Jens Axboe
2025-12-19  5:43                         ` Diangang Li
2026-01-09  8:35                           ` Diangang Li
2026-01-09 23:27                             ` Jens Axboe
2025-12-10  9:53 ` (subset) [RFC PATCH 0/2] " Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b640d708-6270-4946-916d-350d323f1678@bytedance.com \
    --to=lidiangang@bytedance.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=changfengnan@bytedance.com \
    --cc=fengnanchang@gmail.com \
    --cc=io-uring@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox