From: Ming Lei <[email protected]>
To: Jens Axboe <[email protected]>, [email protected]
Cc: [email protected],
	Pavel Begunkov <[email protected]>,
	Kevin Wolf <[email protected]>, Ming Lei <[email protected]>
Subject: [PATCH 5/9] io_uring: support SQE group
Date: Mon,  8 Apr 2024 09:03:18 +0800
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

An SQE group is defined as a chain of SQEs starting with the first SQE that
has IOSQE_EXT_SQE_GROUP set and ending with the first subsequent SQE that
doesn't have it set. It is similar to a chain of linked SQEs.

The first SQE is the group leader, and the other SQEs are group members. The
group leader is always freed after all members have completed. Group members
aren't submitted until the group leader has completed, there is no dependency
among group members, and neither IOSQE_IO_LINK nor IOSQE_IO_DRAIN may be set
on group members.

Typically the group leader provides or produces a resource and the other
members consume it. One example is a multi-destination backup: the first SQE
reads data from the source file into a fixed buffer, and the other SQEs write
data from that same buffer into the destination files. An SQE group completes
this task very efficiently: 1) the read SQE and the write SQEs can be
submitted in a single syscall, so there is no need to submit the read SQE
first and wait for it to complete; 2) the write SQEs don't have to be linked
together, so they can be issued to the destination files concurrently. This
also simplifies the application considerably; a rough usage sketch follows.
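
The sketch below is for illustration only and is not part of this patch: the
helper, the descriptors and the buffer are made up, it assumes liburing's
struct io_uring_sqe exposes the ext_flags field added in patch 2/9 (proper
liburing support is only sketched in the RFC patch 9/9), and error handling
is omitted.

  #include <liburing.h>

  /*
   * The group leader reads from src_fd into a fixed buffer (assumed to be
   * registered as buffer index 0); the members then write that buffer to
   * every dst_fd[i]. Everything is queued before a single io_uring_submit().
   */
  static void queue_backup_group(struct io_uring *ring, int src_fd,
                                 int *dst_fd, int nr_dst,
                                 void *buf, unsigned int len)
  {
          struct io_uring_sqe *sqe;
          int i;

          /* group leader: members are only issued after it completes */
          sqe = io_uring_get_sqe(ring);
          io_uring_prep_read_fixed(sqe, src_fd, buf, len, 0, 0);
          sqe->flags |= IOSQE_HAS_EXT_FLAGS;
          sqe->ext_flags |= IOSQE_EXT_SQE_GROUP;

          /* members: issued concurrently once the read has completed */
          for (i = 0; i < nr_dst; i++) {
                  sqe = io_uring_get_sqe(ring);
                  io_uring_prep_write_fixed(sqe, dst_fd[i], buf, len, 0, 0);
                  /* the group ends at the first SQE without the flag */
                  if (i != nr_dst - 1) {
                          sqe->flags |= IOSQE_HAS_EXT_FLAGS;
                          sqe->ext_flags |= IOSQE_EXT_SQE_GROUP;
                  }
          }
  }

With this patch, each request still gets its own CQE (unless CQE_SKIP is
used), and the leader's CQE is only posted after all members have completed.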

Another use case is supporting generic device zero copy:

- the lead SQE provides a device buffer, which is owned by the device or the
  kernel and must not be exposed to userspace, otherwise it is easy for a
  malicious application to leak it or trigger a panic

- member SQEs read or write concurrently against the buffer provided by the
  lead SQE

Signed-off-by: Ming Lei <[email protected]>
---
 include/linux/io_uring_types.h |  10 ++
 include/uapi/linux/io_uring.h  |   4 +
 io_uring/io_uring.c            | 218 ++++++++++++++++++++++++++++++---
 io_uring/io_uring.h            |  17 ++-
 4 files changed, 231 insertions(+), 18 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 67347e5d06ec..7ce4a2d4a8b8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -444,6 +444,7 @@ enum {
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
 	REQ_F_SQE_EXT_FLAGS_BIT	= IOSQE_HAS_EXT_FLAGS_BIT,
+	REQ_F_SQE_GROUP_BIT	= 8 + IOSQE_EXT_SQE_GROUP_BIT,
 
 	/* first 2 bytes are taken by user flags, shift it to not overlap */
 	REQ_F_FAIL_BIT		= 16,
@@ -474,6 +475,7 @@ enum {
 	REQ_F_CAN_POLL_BIT,
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
+	REQ_F_SQE_GROUP_LEAD_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -497,6 +499,8 @@ enum {
 	REQ_F_BUFFER_SELECT	= IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
 	/* IOSQE_CQE_SKIP_SUCCESS */
 	REQ_F_CQE_SKIP		= IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
+	/* IOSQE_EXT_SQE_GROUP */
+	REQ_F_SQE_GROUP		= IO_REQ_FLAG(REQ_F_SQE_GROUP_BIT),
 
 	/* fail rest of links */
 	REQ_F_FAIL		= IO_REQ_FLAG(REQ_F_FAIL_BIT),
@@ -552,6 +556,8 @@ enum {
 	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
 	/* don't recycle provided buffers for this request */
 	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
+	/* sqe group lead */
+	REQ_F_SQE_GROUP_LEAD	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEAD_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -665,6 +671,10 @@ struct io_kiocb {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
+
+	/* for the group lead, all SQE group members are linked here */
+	struct io_kiocb			*grp_link;
+	atomic_t			grp_refs;
 };
 
 struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4847d7cf1ac9..c0d34f2a2c17 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -128,6 +128,8 @@ enum io_uring_sqe_flags_bit {
 	IOSQE_BUFFER_SELECT_BIT,
 	IOSQE_CQE_SKIP_SUCCESS_BIT,
 	IOSQE_HAS_EXT_FLAGS_BIT,
+
+	IOSQE_EXT_SQE_GROUP_BIT = 0,
 };
 
 /*
@@ -152,6 +154,8 @@ enum io_uring_sqe_flags_bit {
  * sqe->uring_cmd_flags for IORING_URING_CMD.
  */
 #define IOSQE_HAS_EXT_FLAGS	(1U << IOSQE_HAS_EXT_FLAGS_BIT)
+/* defines sqe group */
+#define IOSQE_EXT_SQE_GROUP	(1U << IOSQE_EXT_SQE_GROUP_BIT)
 
 /*
  * io_uring_setup() flags
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4969d21ea2f8..0f41b26723a8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -147,6 +147,10 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool cancel_all);
 
 static void io_queue_sqe(struct io_kiocb *req);
+static bool io_get_sqe(struct io_ring_ctx *ctx,
+		const struct io_uring_sqe **sqe);
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+		       const struct io_uring_sqe *sqe);
 
 struct kmem_cache *req_cachep;
 static struct workqueue_struct *iou_wq __ro_after_init;
@@ -925,10 +929,189 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
 	return posted;
 }
 
+static void __io_req_failed(struct io_kiocb *req, s32 res, bool skip_cqe)
+{
+	const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+	lockdep_assert_held(&req->ctx->uring_lock);
+
+	if (skip_cqe)
+		req->flags |= REQ_F_FAIL | REQ_F_CQE_SKIP;
+	else
+		req_set_fail(req);
+	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
+	if (def->fail)
+		def->fail(req);
+}
+
+void io_req_defer_failed(struct io_kiocb *req, s32 res)
+	__must_hold(&ctx->uring_lock)
+{
+	__io_req_failed(req, res, false);
+	io_req_complete_defer(req);
+}
+
+static void io_req_defer_failed_silently(struct io_kiocb *req, s32 res)
+	__must_hold(&ctx->uring_lock)
+{
+	__io_req_failed(req, res, true);
+	io_req_complete_defer(req);
+}
+
+/*
+ * Called after a member request is completed; returns the lead request
+ * for the caller to fill its CQE and actually free it
+ */
+static inline struct io_kiocb *io_complete_group_member(struct io_kiocb *req)
+{
+	struct io_kiocb *lead = req->grp_link;
+
+	req->grp_link = NULL;
+	if (lead && atomic_dec_and_test(&lead->grp_refs)) {
+		req->grp_link = NULL;
+		lead->flags &= ~REQ_F_SQE_GROUP_LEAD;
+		return lead;
+	}
+
+	return NULL;
+}
+
+/*
+ * Called after the lead request is completed and before posting its cqe
+ * and freeing it, in order to issue the member requests
+ */
+void io_complete_group_lead(struct io_kiocb *req, unsigned issue_flags)
+{
+	struct io_kiocb *member = req->grp_link;
+
+	if (!member)
+		return;
+
+	while (member) {
+		struct io_kiocb *next = member->grp_link;
+
+		member->grp_link = req;
+		if (unlikely(req->flags & REQ_F_FAIL)) {
+			/*
+			 * The group lead has failed, so simply fail the
+			 * members with -EIO; the application can figure out
+			 * the reason from the lead's cqe->res
+			 */
+			__io_req_failed(member, -EIO, false);
+
+			if (issue_flags & IO_URING_F_COMPLETE_DEFER)
+				io_req_complete_defer(member);
+			else {
+				member->io_task_work.func = io_req_task_complete;
+				io_req_task_work_add(member);
+			}
+		} else {
+			trace_io_uring_submit_req(member);
+			if ((issue_flags & IO_URING_F_COMPLETE_DEFER) &&
+					!(member->flags & REQ_F_FORCE_ASYNC))
+				io_queue_sqe(member);
+			else
+				io_req_task_queue(member);
+		}
+		member = next;
+	}
+	req->grp_link = NULL;
+}
+
+static bool sqe_is_group_member(const struct io_uring_sqe *sqe)
+{
+	return (READ_ONCE(sqe->flags) & IOSQE_HAS_EXT_FLAGS) &&
+		(READ_ONCE(sqe->ext_flags) & IOSQE_EXT_SQE_GROUP);
+}
+
+/*
+ * Initialize the whole SQE group.
+ *
+ * Walk every member in this group even if a failure happens. If the lead
+ * request fails, a CQE is posted only for the lead; otherwise a CQE is
+ * posted for every member request. Member requests aren't issued until
+ * the lead has completed, and the lead request won't be freed until all
+ * member requests have completed.
+ *
+ * The whole group shares the link flags of the group lead, and members
+ * aren't allowed to set any LINK flag. Only the lead request may appear
+ * in the submission link list.
+ */
+static int io_init_req_group(struct io_ring_ctx *ctx, struct io_kiocb *lead,
+		int *nr, int lead_ret)
+	__must_hold(&ctx->uring_lock)
+{
+	bool more = true;
+	int cnt = 0;
+
+	lead->grp_link = NULL;
+	do {
+		const struct io_uring_sqe *sqe;
+		struct io_kiocb *req = NULL;
+		int ret = -ENOMEM;
+
+		io_alloc_req(ctx, &req);
+
+		if (unlikely(!io_get_sqe(ctx, &sqe))) {
+			if (req)
+				io_req_add_to_cache(req, ctx);
+			break;
+		}
+
+		/* one group ends with !IOSQE_EXT_SQE_GROUP */
+		if (!sqe_is_group_member(sqe))
+			more = false;
+
+		if (req) {
+			ret = io_init_req(ctx, req, sqe);
+			/*
+			 * Neither IO_LINK nor IO_DRAIN is allowed for a
+			 * group member; the boundary has to be set in the
+			 * lead sqe, so the whole group shares the same
+			 * IO_LINK and IO_DRAIN.
+			 */
+			if (!ret && (req->flags & (IO_REQ_LINK_FLAGS |
+							REQ_F_IO_DRAIN)))
+				ret = -EINVAL;
+			if (!more)
+				req->flags |= REQ_F_SQE_GROUP;
+			if (unlikely(ret)) {
+				/*
+				 * The lead will be failed, so don't post
+				 * CQE for any member
+				 */
+				io_req_defer_failed_silently(req, ret);
+			} else {
+				req->grp_link = lead->grp_link;
+				lead->grp_link = req;
+			}
+		}
+		cnt += 1;
+		if (ret)
+			lead_ret = ret;
+	} while (more);
+
+	/* Mark lead if we get members, otherwise degrade it to normal req */
+	if (cnt > 0) {
+		lead->flags |= REQ_F_SQE_GROUP_LEAD;
+		atomic_set(&lead->grp_refs, cnt);
+		*nr += cnt;
+	} else {
+		lead->flags &= ~REQ_F_SQE_GROUP;
+	}
+
+	return lead_ret;
+}
+
 static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (req_is_group_lead(req)) {
+		io_complete_group_lead(req, issue_flags);
+		return;
+	}
+
 	/*
 	 * All execution paths but io-wq use the deferred completions by
 	 * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here.
@@ -960,20 +1143,6 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 	req_ref_put(req);
 }
 
-void io_req_defer_failed(struct io_kiocb *req, s32 res)
-	__must_hold(&ctx->uring_lock)
-{
-	const struct io_cold_def *def = &io_cold_defs[req->opcode];
-
-	lockdep_assert_held(&req->ctx->uring_lock);
-
-	req_set_fail(req);
-	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
-	if (def->fail)
-		def->fail(req);
-	io_req_complete_defer(req);
-}
-
 /*
  * Don't initialise the fields below on every allocation, but do that in
  * advance and keep them valid across allocations.
@@ -1459,7 +1628,8 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 }
 
 static inline void io_fill_cqe_lists(struct io_ring_ctx *ctx,
-				     struct io_wq_work_list *list)
+				     struct io_wq_work_list *list,
+				     struct io_wq_work_list *grp)
 {
 	struct io_wq_work_node *node;
 
@@ -1477,6 +1647,13 @@ static inline void io_fill_cqe_lists(struct io_ring_ctx *ctx,
 				io_req_cqe_overflow(req);
 			}
 		}
+
+		if (grp && req_is_group_member(req)) {
+			struct io_kiocb *lead = io_complete_group_member(req);
+
+			if (lead)
+				wq_list_add_head(&lead->comp_list, grp);
+		}
 	}
 }
 
@@ -1484,14 +1661,19 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_state *state = &ctx->submit_state;
+	struct io_wq_work_list list = {NULL, NULL};
 
 	__io_cq_lock(ctx);
-	io_fill_cqe_lists(ctx, &state->compl_reqs);
+	io_fill_cqe_lists(ctx, &state->compl_reqs, &list);
+	if (!wq_list_empty(&list))
+		io_fill_cqe_lists(ctx, &list, NULL);
 	__io_cq_unlock_post(ctx);
 
-	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
+	if (!wq_list_empty(&state->compl_reqs)) {
 		io_free_batch_list(ctx, state->compl_reqs.first);
 		INIT_WQ_LIST(&state->compl_reqs);
+		if (!wq_list_empty(&list))
+			io_free_batch_list(ctx, list.first);
 	}
 	ctx->submit_state.cq_flush = false;
 }
@@ -2212,6 +2394,8 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	*nr = 1;
 	ret = io_init_req(ctx, req, sqe);
+	if (req->flags & REQ_F_SQE_GROUP)
+		ret = io_init_req_group(ctx, req, nr, ret);
 	if (unlikely(ret))
 		return io_submit_fail_init(sqe, req, ret);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 1eb65324792a..99eeb4eee219 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -104,6 +104,7 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
+void io_complete_group_lead(struct io_kiocb *req, unsigned int issue_flags);
 
 enum {
 	IO_EVENTFD_OP_SIGNAL_BIT,
@@ -113,6 +114,16 @@ enum {
 void io_eventfd_ops(struct rcu_head *rcu);
 void io_activate_pollwq(struct io_ring_ctx *ctx);
 
+static inline bool req_is_group_lead(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_SQE_GROUP_LEAD;
+}
+
+static inline bool req_is_group_member(struct io_kiocb *req)
+{
+	return !req_is_group_lead(req) && (req->flags & REQ_F_SQE_GROUP);
+}
+
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
 #if defined(CONFIG_PROVE_LOCKING)
@@ -355,7 +366,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
 
 	lockdep_assert_held(&req->ctx->uring_lock);
 
-	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+	if (unlikely(req_is_group_lead(req)))
+		io_complete_group_lead(req, IO_URING_F_COMPLETE_DEFER |
+				IO_URING_F_NONBLOCK);
+	else
+		wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
 
 static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
-- 
2.42.0


Thread overview: 38+ messages
2024-04-08  1:03 [RFC PATCH 0/9] io_uring: support sqe group and provide group kbuf Ming Lei
2024-04-08  1:03 ` [PATCH 1/9] io_uring: net: don't check sqe->__pad2[0] for send zc Ming Lei
2024-04-08  1:03 ` [PATCH 2/9] io_uring: support user sqe ext flags Ming Lei
2024-04-22 18:16   ` Jens Axboe
2024-04-23 13:57     ` Ming Lei
2024-04-29 15:24       ` Pavel Begunkov
2024-04-30  3:43         ` Ming Lei
2024-04-30 12:00           ` Pavel Begunkov
2024-04-30 12:56             ` Ming Lei
2024-04-30 14:10               ` Pavel Begunkov
2024-04-30 15:46                 ` Ming Lei
2024-05-02 14:22                   ` Pavel Begunkov
2024-05-04  1:19                     ` Ming Lei
2024-04-08  1:03 ` [PATCH 3/9] io_uring: add helper for filling cqes in __io_submit_flush_completions() Ming Lei
2024-04-08  1:03 ` [PATCH 4/9] io_uring: add one output argument to io_submit_sqe Ming Lei
2024-04-08  1:03 ` Ming Lei [this message]
2024-04-22 18:27   ` [PATCH 5/9] io_uring: support SQE group Jens Axboe
2024-04-23 13:08     ` Kevin Wolf
2024-04-24  1:39       ` Ming Lei
2024-04-25  9:27         ` Kevin Wolf
2024-04-26  7:53           ` Ming Lei
2024-04-26 17:05             ` Kevin Wolf
2024-04-29  3:34               ` Ming Lei
2024-04-29 15:48         ` Pavel Begunkov
2024-04-30  3:07           ` Ming Lei
2024-04-29 15:32       ` Pavel Begunkov
2024-04-30  3:03         ` Ming Lei
2024-04-30 12:27           ` Pavel Begunkov
2024-04-30 15:00             ` Ming Lei
2024-05-02 14:09               ` Pavel Begunkov
2024-05-04  1:56                 ` Ming Lei
2024-05-02 14:28               ` Pavel Begunkov
2024-04-24  0:46     ` Ming Lei
2024-04-08  1:03 ` [PATCH 6/9] io_uring: support providing sqe group buffer Ming Lei
2024-04-08  1:03 ` [PATCH 7/9] io_uring/uring_cmd: support provide group kernel buffer Ming Lei
2024-04-08  1:03 ` [PATCH 8/9] ublk: support provide io buffer Ming Lei
2024-04-08  1:03 ` [RFC PATCH 9/9] liburing: support sqe ext_flags & sqe group Ming Lei
2024-04-19  0:55 ` [RFC PATCH 0/9] io_uring: support sqe group and provide group kbuf Ming Lei
