* [PATCH AUTOSEL 6.0 44/67] io_uring: fix CQE reordering
From: Sasha Levin @ 2022-10-13  0:15 UTC
  To: linux-kernel, stable
  Cc: Pavel Begunkov, Dylan Yudaken, Jens Axboe, Sasha Levin, io-uring

From: Pavel Begunkov <[email protected]>

[ Upstream commit aa1df3a360a0c50e0f0086a785d75c2785c29967 ]

Overflowing CQEs may result in reordering, which is buggy in the case
of links, F_MORE and so on. If we guarantee that we don't reorder even
in the unlikely event of a CQ ring overflow, then we can further extend
this so that multishot requests don't have to be terminated when an
overflow happens. For other operations, like zerocopy sends, we have no
choice but to honor CQE ordering.
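
As a sketch of the rule the patch enforces, consider the following
compilable userspace mock (not kernel code; cq_state and
may_post_inline are illustrative names, not io_uring APIs): once any
completion has been forced into the overflow list, every later
completion must queue behind it, even if the CQ ring has free slots
again.

	#include <stdbool.h>

	struct cq_state {
		bool has_overflow;	/* mirrors IO_CHECK_CQ_OVERFLOW_BIT */
		bool ring_full;		/* no free slot in the CQ ring */
	};

	/* May this CQE go straight into the CQ ring? */
	static bool may_post_inline(const struct cq_state *cq,
				    bool flushing_overflow)
	{
		/*
		 * While overflowed CQEs are pending, only the
		 * overflow-flush path may post inline; every other
		 * completion must queue behind them to keep ordering.
		 */
		if (!flushing_overflow && cq->has_overflow)
			return false;
		return !cq->ring_full;
	}

This is the shape of the new check added to __io_get_cqe() below:
returning NULL there routes the completion to the overflow path instead
of letting it jump ahead of already-overflowed entries.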

Reported-by: Dylan Yudaken <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Link: https://lore.kernel.org/r/ec3bc55687b0768bbe20fb62d7d06cfced7d7e70.1663892031.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
 io_uring/io_uring.c | 12 ++++++++++--
 io_uring/io_uring.h | 12 +++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 242d896c00f3..13af6b56ebd2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -567,7 +567,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe(ctx);
+		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
 		struct io_overflow_cqe *ocqe;
 
 		if (!cqe && !force)
@@ -694,12 +694,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
 	unsigned int free, queued, len;
 
+	/*
+	 * Posting into the CQ when there are pending overflowed CQEs may break
+	 * ordering guarantees, which will affect links, F_MORE users and more.
+	 * Force overflow the completion.
+	 */
+	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
+		return NULL;
 
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
@@ -2228,6 +2235,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 	do {
 		io_cqring_overflow_flush(ctx);
+
 		if (io_cqring_events(ctx) >= min_events)
 			return 0;
 		if (!io_run_task_work())
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 2f73f83af960..45809ae6f64e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -24,7 +24,7 @@ enum {
 	IOU_STOP_MULTISHOT	= -ECANCELED,
 };
 
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
 bool io_req_cqe_overflow(struct io_kiocb *req);
 int io_run_task_work_sig(void);
 void io_req_complete_failed(struct io_kiocb *req, s32 res);
@@ -91,7 +91,8 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
 
 void io_cq_unlock_post(struct io_ring_ctx *ctx);
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
+						       bool overflow)
 {
 	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
 		struct io_uring_cqe *cqe = ctx->cqe_cached;
@@ -103,7 +104,12 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return cqe;
 	}
 
-	return __io_get_cqe(ctx);
+	return __io_get_cqe(ctx, overflow);
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+	return io_get_cqe_overflow(ctx, false);
 }
 
 static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
-- 
2.35.1


* [PATCH AUTOSEL 6.0 56/67] block: replace blk_queue_nowait with bdev_nowait
From: Sasha Levin @ 2022-10-13  0:15 UTC
  To: linux-kernel, stable
  Cc: Christoph Hellwig, Pankaj Raghav, Jens Axboe, Sasha Levin, agk,
	snitzer, dm-devel, song, linux-block, linux-raid, io-uring

From: Christoph Hellwig <[email protected]>

[ Upstream commit 568ec936bf1384fc15873908c96a9aeb62536edb ]

Replace blk_queue_nowait with a bdev_nowait helper that takes a
block_device, given that the I/O submission path should not have to
look into the request_queue.
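
The pattern is small enough to show as a self-contained userspace mock
(the struct layouts and flag value are simplified stand-ins, not the
real kernel definitions): the flag test moves behind a bdev-level
helper so submission-path callers never fetch the request_queue
themselves.

	#include <stdbool.h>

	struct request_queue { unsigned long queue_flags; };
	struct block_device  { struct request_queue *queue; };

	#define QUEUE_FLAG_NOWAIT	(1UL << 0)	/* illustrative bit */

	static struct request_queue *bdev_get_queue(struct block_device *bdev)
	{
		return bdev->queue;
	}

	/* mirrors the new helper: callers test the bdev, not the queue */
	static bool bdev_nowait(struct block_device *bdev)
	{
		return bdev_get_queue(bdev)->queue_flags & QUEUE_FLAG_NOWAIT;
	}

Call sites such as submit_bio_noacct() then shrink to a single
bdev_nowait(bdev) test, as the diff below shows.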

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Pankaj Raghav <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
---
 block/blk-core.c       | 2 +-
 drivers/md/dm-table.c  | 4 +---
 drivers/md/md.c        | 4 ++--
 include/linux/blkdev.h | 6 +++++-
 io_uring/io_uring.c    | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 651057c4146b..4ec669b0eadc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -717,7 +717,7 @@ void submit_bio_noacct(struct bio *bio)
 	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 	 * if queue does not support NOWAIT.
 	 */
-	if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
+	if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
 		goto not_supported;
 
 	if (should_fail_bio(bio))
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 332f96b58252..d8034ff0cb24 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1856,9 +1856,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
 static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
 				     sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return !blk_queue_nowait(q);
+	return !bdev_nowait(dev->bdev);
 }
 
 static bool dm_table_supports_nowait(struct dm_table *t)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 729be2c5296c..72ad352cb41a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5845,7 +5845,7 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
-		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
+		nowait = nowait && bdev_nowait(rdev->bdev);
 	}
 
 	if (!bioset_initialized(&mddev->bio_set)) {
@@ -6982,7 +6982,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	 * If the new disk does not support REQ_NOWAIT,
 	 * disable on the whole MD.
 	 */
-	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+	if (!bdev_nowait(rdev->bdev)) {
 		pr_info("%s: Disabling nowait because %pg does not support nowait\n",
 			mdname(mddev), rdev->bdev);
 		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 84b13fdd34a7..4750772ef228 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_quiesced(q)	test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
 #define blk_queue_pm_only(q)	atomic_read(&(q)->pm_only)
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
-#define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
@@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev)
 	return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags);
 }
 
+static inline bool bdev_nowait(struct block_device *bdev)
+{
+	return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
+}
+
 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13af6b56ebd2..c13122a87c40 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1384,7 +1384,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
 
 static bool io_bdev_nowait(struct block_device *bdev)
 {
-	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
+	return !bdev || bdev_nowait(bdev);
 }
 
 /*
-- 
2.35.1

