public inbox for [email protected]
* [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
@ 2021-02-23 12:40 Pavel Begunkov
  2021-02-23 14:22 ` Jens Axboe
  2021-02-24  3:06 ` Hao Xu
  0 siblings, 2 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-02-23 12:40 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Instead of using a request itself for overflowed CQE stashing, allocate
a separate entry. The disadvantage is that the allocation may fail and
it will be accounted as lost (see rings->cq_overflow), so we lose
reliability in case of memory pressure. However, it opens the way for
multiple CQEs per SQE and even for generating SQE-less CQEs.

Signed-off-by: Pavel Begunkov <[email protected]>
---

v2: rebase

 fs/io_uring.c | 98 +++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 53 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3f764f2f2982..cb50dc22b502 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -205,6 +205,11 @@ struct io_mapped_ubuf {
 
 struct io_ring_ctx;
 
+struct io_overflow_cqe {
+	struct io_uring_cqe cqe;
+	struct list_head list;
+};
+
 struct io_rsrc_put {
 	struct list_head list;
 	union {
@@ -1474,41 +1479,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 }
 
 /* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-				       struct task_struct *tsk,
-				       struct files_struct *files)
+static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	struct io_rings *rings = ctx->rings;
-	struct io_kiocb *req, *tmp;
-	struct io_uring_cqe *cqe;
 	unsigned long flags;
 	bool all_flushed, posted;
-	LIST_HEAD(list);
 
 	if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
 		return false;
 
 	posted = false;
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
-		if (!io_match_task(req, tsk, files))
-			continue;
+	while (!list_empty(&ctx->cq_overflow_list)) {
+		struct io_uring_cqe *cqe = io_get_cqring(ctx);
+		struct io_overflow_cqe *ocqe;
 
-		cqe = io_get_cqring(ctx);
 		if (!cqe && !force)
 			break;
-
-		list_move(&req->compl.list, &list);
-		if (cqe) {
-			WRITE_ONCE(cqe->user_data, req->user_data);
-			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->compl.cflags);
-		} else {
-			ctx->cached_cq_overflow++;
+		ocqe = list_first_entry(&ctx->cq_overflow_list,
+					struct io_overflow_cqe, list);
+		if (cqe)
+			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+		else
 			WRITE_ONCE(ctx->rings->cq_overflow,
-				   ctx->cached_cq_overflow);
-		}
+				   ++ctx->cached_cq_overflow);
 		posted = true;
+		list_del(&ocqe->list);
+		kfree(ocqe);
 	}
 
 	all_flushed = list_empty(&ctx->cq_overflow_list);
@@ -1523,25 +1520,16 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	if (posted)
 		io_cqring_ev_posted(ctx);
-
-	while (!list_empty(&list)) {
-		req = list_first_entry(&list, struct io_kiocb, compl.list);
-		list_del(&req->compl.list);
-		io_put_req(req);
-	}
-
 	return all_flushed;
 }
 
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-				     struct task_struct *tsk,
-				     struct files_struct *files)
+static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	if (test_bit(0, &ctx->cq_check_overflow)) {
 		/* iopoll syncs against uring_lock, not completion_lock */
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&ctx->uring_lock);
-		__io_cqring_overflow_flush(ctx, force, tsk, files);
+		__io_cqring_overflow_flush(ctx, force);
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_unlock(&ctx->uring_lock);
 	}
@@ -1564,27 +1552,32 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
-	} else if (ctx->cq_overflow_flushed ||
-		   atomic_read(&req->task->io_uring->in_idle)) {
-		/*
-		 * If we're in ring overflow flush mode, or in task cancel mode,
-		 * then we cannot store the request for later flushing, we need
-		 * to drop it on the floor.
-		 */
-		ctx->cached_cq_overflow++;
-		WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
-	} else {
+		return;
+	}
+	if (!ctx->cq_overflow_flushed &&
+	    !atomic_read(&req->task->io_uring->in_idle)) {
+		struct io_overflow_cqe *ocqe = kmalloc(sizeof(*ocqe), GFP_KERNEL);
+
+		if (!ocqe)
+			goto overflow;
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
 			set_bit(0, &ctx->cq_check_overflow);
 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
-		io_clean_op(req);
-		req->result = res;
-		req->compl.cflags = cflags;
-		refcount_inc(&req->refs);
-		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
+		ocqe->cqe.user_data = req->user_data;
+		ocqe->cqe.res = res;
+		ocqe->cqe.flags = cflags;
+		list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+		return;
 	}
+overflow:
+	/*
+	 * If we're in ring overflow flush mode, or in task cancel mode,
+	 * or cannot allocate an overflow entry, then we need to drop it
+	 * on the floor.
+	 */
+	WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
 }
 
 static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -2407,7 +2400,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 		 * already triggered a CQE (eg in error).
 		 */
 		if (test_bit(0, &ctx->cq_check_overflow))
-			__io_cqring_overflow_flush(ctx, false, NULL, NULL);
+			__io_cqring_overflow_flush(ctx, false);
 		if (io_cqring_events(ctx))
 			break;
 
@@ -6537,7 +6530,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
 	if (test_bit(0, &ctx->sq_check_overflow)) {
-		if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
+		if (!__io_cqring_overflow_flush(ctx, false))
 			return -EBUSY;
 	}
 
@@ -6887,7 +6880,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	int ret;
 
 	do {
-		io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_cqring_overflow_flush(ctx, false);
 		if (io_cqring_events(ctx) >= min_events)
 			return 0;
 		if (!io_run_task_work())
@@ -6918,7 +6911,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
-		io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_cqring_overflow_flush(ctx, false);
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
 		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
@@ -8527,7 +8520,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	/* if force is set, the ring is going away. always drop after that */
 	ctx->cq_overflow_flushed = 1;
 	if (ctx->rings)
-		__io_cqring_overflow_flush(ctx, true, NULL, NULL);
+		__io_cqring_overflow_flush(ctx, true);
 	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 	mutex_unlock(&ctx->uring_lock);
 
@@ -8635,7 +8628,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 		ret |= io_poll_remove_all(ctx, task, files);
 		ret |= io_kill_timeouts(ctx, task, files);
 		ret |= io_run_task_work();
-		io_cqring_overflow_flush(ctx, true, task, files);
 		if (!ret)
 			break;
 		cond_resched();
@@ -9111,7 +9103,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	 */
 	ret = 0;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
-		io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_cqring_overflow_flush(ctx, false);
 
 		ret = -EOWNERDEAD;
 		if (unlikely(ctx->sqo_dead))
-- 
2.24.0


* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-23 12:40 [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs Pavel Begunkov
@ 2021-02-23 14:22 ` Jens Axboe
  2021-02-23 15:36   ` Pavel Begunkov
  2021-02-24  3:06 ` Hao Xu
  1 sibling, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2021-02-23 14:22 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/23/21 5:40 AM, Pavel Begunkov wrote:
> +	if (!ctx->cq_overflow_flushed &&
> +	    !atomic_read(&req->task->io_uring->in_idle)) {
> +		struct io_overflow_cqe *ocqe = kmalloc(sizeof(*ocqe), GFP_KERNEL);

This needs to be GFP_ATOMIC. And we should probably make it GFP_ATOMIC |
__GFP_ACCOUNT to ensure it gets accounted, since the list could grow
pretty big potentially.
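
Roughly, an untested sketch of what that hunk might look like with those
flags (it assumes the call site stays under ->completion_lock as it does
in this patch, hence the atomic allocation):

	struct io_overflow_cqe *ocqe;

	/* can't sleep here: called under ->completion_lock with IRQs off */
	/* __GFP_ACCOUNT charges the entry to the task's memcg */
	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
	if (!ocqe)
		goto overflow;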

-- 
Jens Axboe


* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-23 14:22 ` Jens Axboe
@ 2021-02-23 15:36   ` Pavel Begunkov
  0 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-02-23 15:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 23/02/2021 14:22, Jens Axboe wrote:
> On 2/23/21 5:40 AM, Pavel Begunkov wrote:
>> +	if (!ctx->cq_overflow_flushed &&
>> +	    !atomic_read(&req->task->io_uring->in_idle)) {
>> +		struct io_overflow_cqe *ocqe = kmalloc(sizeof(*ocqe), GFP_KERNEL);
> 
> This needs to be GFP_ATOMIC. And we should probably make it GFP_ATOMIC |
> __GFP_ACCOUNT to ensure it gets accounted, since the list could grow
> pretty big potentially.

Agree. I was thinking about it being naturally limited by inflight reqs,
but, well, we're speaking about multiple CQEs...

-- 
Pavel Begunkov

* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-23 12:40 [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs Pavel Begunkov
  2021-02-23 14:22 ` Jens Axboe
@ 2021-02-24  3:06 ` Hao Xu
  2021-02-24  3:18   ` Jens Axboe
  1 sibling, 1 reply; 7+ messages in thread
From: Hao Xu @ 2021-02-24  3:06 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

On 2021/2/23 8:40 PM, Pavel Begunkov wrote:
> Instead of using a request itself for overflowed CQE stashing, allocate
> a separate entry. The disadvantage is that the allocation may fail and
> it will be accounted as lost (see rings->cq_overflow), so we lose
> reliability in case of memory pressure. However, it opens the way for
> multiple CQEs per SQE and even for generating SQE-less CQEs.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
Hi Pavel,
Allow me to ask a stupid question, why do we need to support multiple 
CQEs per SQE or even SQE-less CQEs in the future?

Thanks,
Hao

* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-24  3:06 ` Hao Xu
@ 2021-02-24  3:18   ` Jens Axboe
  2021-02-24  3:47     ` Hao Xu
  2021-02-24  9:39     ` Pavel Begunkov
  0 siblings, 2 replies; 7+ messages in thread
From: Jens Axboe @ 2021-02-24  3:18 UTC (permalink / raw)
  To: Hao Xu, Pavel Begunkov, io-uring

On 2/23/21 8:06 PM, Hao Xu wrote:
> On 2021/2/23 8:40 PM, Pavel Begunkov wrote:
>> Instead of using a request itself for overflowed CQE stashing, allocate
>> a separate entry. The disadvantage is that the allocation may fail and
>> it will be accounted as lost (see rings->cq_overflow), so we lose
>> reliability in case of memory pressure. However, it opens the way for
>> multiple CQEs per SQE and even for generating SQE-less CQEs.
>>
>> Signed-off-by: Pavel Begunkov <[email protected]>
>> ---
> Hi Pavel,
> Allow me to ask a stupid question, why do we need to support multiple 
> CQEs per SQE or even SQE-less CQEs in the future?

Not a stupid question at all, since it's not something we've done
before. There's been discussion about this in the past, in the presence
of the zero copy IO where we ideally want to post two CQEs for an SQE.
Most recently I've been playing with multishot poll support, where a
POLL_ADD will stay active after triggering. Hence you could be posting
many CQEs for that SQE over the lifetime of the request.
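
Purely as a hypothetical sketch (io_poll_multishot_event is a made-up name,
not the eventual implementation), such a completion path could post one CQE
per event while leaving the request armed:

	static void io_poll_multishot_event(struct io_kiocb *req, long res)
	{
		struct io_ring_ctx *ctx = req->ctx;
		unsigned long flags;

		spin_lock_irqsave(&ctx->completion_lock, flags);
		/* post a CQE for this event; the request itself stays active */
		__io_cqring_fill_event(req, res, 0);
		io_commit_cqring(ctx);
		spin_unlock_irqrestore(&ctx->completion_lock, flags);
		io_cqring_ev_posted(ctx);
		/* no io_put_req() here, the poll request is not retired */
	}

With only one req to stash an overflowed completion in, the old scheme
couldn't absorb more than one such CQE per request, which is why the
separately allocated overflow entries in this patch matter.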

-- 
Jens Axboe


* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-24  3:18   ` Jens Axboe
@ 2021-02-24  3:47     ` Hao Xu
  2021-02-24  9:39     ` Pavel Begunkov
  1 sibling, 0 replies; 7+ messages in thread
From: Hao Xu @ 2021-02-24  3:47 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On 2021/2/24 11:18 AM, Jens Axboe wrote:
> On 2/23/21 8:06 PM, Hao Xu wrote:
>> On 2021/2/23 8:40 PM, Pavel Begunkov wrote:
>>> Instead of using a request itself for overflowed CQE stashing, allocate
>>> a separate entry. The disadvantage is that the allocation may fail and
>>> it will be accounted as lost (see rings->cq_overflow), so we lose
>>> reliability in case of memory pressure. However, it opens the way for
>>> multiple CQEs per SQE and even for generating SQE-less CQEs.
>>>
>>> Signed-off-by: Pavel Begunkov <[email protected]>
>>> ---
>> Hi Pavel,
>> Allow me to ask a stupid question, why do we need to support multiple
>> CQEs per SQE or even SQE-less CQEs in the future?
> 
> Not a stupid question at all, since it's not something we've done
> before. There's been discussion about this in the past, in the presence
> of the zero copy IO where we ideally want to post two CQEs for an SQE.
> Most recently I've been playing with multishot poll support, where a
> POLL_ADD will stay active after triggering. Hence you could be posting
> many CQEs for that SQE over the lifetime of the request.
> 
I see, super thanks Jens.

* Re: [PATCH v2 1/1] io_uring: allocate memory for overflowed CQEs
  2021-02-24  3:18   ` Jens Axboe
  2021-02-24  3:47     ` Hao Xu
@ 2021-02-24  9:39     ` Pavel Begunkov
  1 sibling, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-02-24  9:39 UTC (permalink / raw)
  To: Jens Axboe, Hao Xu, io-uring

On 24/02/2021 03:18, Jens Axboe wrote:
> On 2/23/21 8:06 PM, Hao Xu wrote:
>> On 2021/2/23 8:40 PM, Pavel Begunkov wrote:
>>> Instead of using a request itself for overflowed CQE stashing, allocate
>>> a separate entry. The disadvantage is that the allocation may fail and
>>> it will be accounted as lost (see rings->cq_overflow), so we lose
>>> reliability in case of memory pressure. However, it opens the way for
>>> multiple CQEs per SQE and even for generating SQE-less CQEs.
>>>
>>> Signed-off-by: Pavel Begunkov <[email protected]>
>>> ---
>> Hi Pavel,
>> Allow me to ask a stupid question, why do we need to support multiple 
>> CQEs per SQE or even SQE-less CQEs in the future?
> 
> Not a stupid question at all, since it's not something we've done
> before. There's been discussion about this in the past, in the presence
> of the zero copy IO where we ideally want to post two CQEs for an SQE.
> Most recently I've been playing with multishot poll support, where a
> POLL_ADD will stay active after triggering. Hence you could be posting
> many CQEs for that SQE over the lifetime of the request.

Yep, in addition should be useful for eBPF requests.

-- 
Pavel Begunkov
