public inbox for [email protected]
 help / color / mirror / Atom feed
From: Pavel Begunkov <[email protected]>
To: [email protected]
Cc: [email protected]
Subject: [RFC 1/3] io_uring: introduce request parameter sets
Date: Sun, 10 Nov 2024 14:56:20 +0000	[thread overview]
Message-ID: <877a43b660a5fec4d658007a8c77bf73471b0b64.1731205010.git.asml.silence@gmail.com> (raw)
In-Reply-To: <[email protected]>

There are lots of parameters we might want to additionally pass to a
request, but SQE has limited space and it may require additional parsing
and checking in the hot path. Instead, let the user register parameter
sets in advance; requests then take an index specifying which parameter
set to use.

The benefit for the kernel is that we can put any number of arguments in
there and then do pre-processing at initialisation time, like
renumbering flags and enabling static keys for performance-deprecated
features. The obvious downside is that the user can't use the entire
parameter space as there can only be a limited number of sets. The
main target here is tuning the waiting loop with finer-grained control
over when we should wake the task and return to the user.

The current implementation is crude: it needs a SETUP flag disabling
creds/personalities, and is limited to one registration of at most 16
sets. It could be made to co-exist with creds and to be registered and
expanded more flexibly.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 include/linux/io_uring_types.h |  8 ++++++
 include/uapi/linux/io_uring.h  |  9 ++++++
 io_uring/io_uring.c            | 36 ++++++++++++++++--------
 io_uring/msg_ring.c            |  1 +
 io_uring/net.c                 |  1 +
 io_uring/register.c            | 51 ++++++++++++++++++++++++++++++++++
 6 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index ad5001102c86..79f38c07642d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,10 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+struct io_set {
+	u32 flags;	/* filled from io_uring_ioset_reg::flags at registration */
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -268,6 +272,9 @@ struct io_ring_ctx {
 		unsigned		cached_sq_head;
 		unsigned		sq_entries;
 
+		struct io_set		iosets[16];
+		unsigned int		nr_iosets;
+
 		/*
 		 * Fixed resources fast path, should be accessed only under
 		 * uring_lock, and updated through io_uring_register(2)
@@ -635,6 +642,7 @@ struct io_kiocb {
 
 	struct io_ring_ctx		*ctx;
 	struct io_uring_task		*tctx;
+	struct io_set			*ioset;
 
 	union {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ba373deb8406..6a432383e7c3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -158,6 +158,8 @@ enum io_uring_sqe_flags_bit {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+#define IORING_SETUP_IOSET	(1U << 18)	/* sqe->personality indexes a parameter set; bit 8 is COOP_TASKRUN */
+
 /*
  * Cooperative task running. When requests complete, they often require
  * forcing the submitter to transition to the kernel to complete. If this
@@ -634,6 +636,8 @@ enum io_uring_register_op {
 	/* register fixed io_uring_reg_wait arguments */
 	IORING_REGISTER_CQWAIT_REG		= 34,
 
+	IORING_REGISTER_IOSETS			= 35,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -895,6 +899,11 @@ struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_ioset_reg {
+	__u64 flags;		/* no flags defined yet, must be zero */
+	__u64 __resv[3];	/* reserved, must be zero */
+};
+
 /*
  * Argument for IORING_OP_URING_CMD when file is a socket
  */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f34fa1ead2cf..cf688a9ff737 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2156,6 +2156,7 @@ static void io_init_req_drain(struct io_kiocb *req)
 
 static __cold int io_init_fail_req(struct io_kiocb *req, int err)
 {
+	req->ioset = &req->ctx->iosets[0];
 	/* ensure per-opcode data is cleared if we fail before prep */
 	memset(&req->cmd.data, 0, sizeof(req->cmd.data));
 	return err;
@@ -2238,19 +2239,27 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 
 	personality = READ_ONCE(sqe->personality);
-	if (personality) {
-		int ret;
-
-		req->creds = xa_load(&ctx->personalities, personality);
-		if (!req->creds)
+	if (ctx->flags & IORING_SETUP_IOSET) {
+		if (unlikely(personality >= ctx->nr_iosets))
 			return io_init_fail_req(req, -EINVAL);
-		get_cred(req->creds);
-		ret = security_uring_override_creds(req->creds);
-		if (ret) {
-			put_cred(req->creds);
-			return io_init_fail_req(req, ret);
+		personality = array_index_nospec(personality, ctx->nr_iosets);
+		req->ioset = &ctx->iosets[personality];
+	} else {
+		if (personality) {
+			int ret;
+
+			req->creds = xa_load(&ctx->personalities, personality);
+			if (!req->creds)
+				return io_init_fail_req(req, -EINVAL);
+			get_cred(req->creds);
+			ret = security_uring_override_creds(req->creds);
+			if (ret) {
+				put_cred(req->creds);
+				return io_init_fail_req(req, ret);
+			}
+			req->flags |= REQ_F_CREDS;
 		}
-		req->flags |= REQ_F_CREDS;
+		req->ioset = &ctx->iosets[0];
 	}
 
 	return def->prep(req, sqe);
@@ -3909,6 +3918,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!ctx)
 		return -ENOMEM;
 
+	ctx->nr_iosets = 0;
+
 	ctx->clockid = CLOCK_MONOTONIC;
 	ctx->clock_offset = 0;
 
@@ -4076,7 +4087,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+			IORING_SETUP_IOSET))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index e63af34004b7..f5a747aa255c 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -98,6 +98,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	io_req_set_res(req, res, cflags);
 	percpu_ref_get(&ctx->refs);
 	req->ctx = ctx;
+	req->ioset = &ctx->iosets[0];
 	req->io_task_work.func = io_msg_tw_complete;
 	io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
 	return 0;
diff --git a/io_uring/net.c b/io_uring/net.c
index 2ccc2b409431..785987bf9e6a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1242,6 +1242,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	notif = zc->notif = io_alloc_notif(ctx);
 	if (!notif)
 		return -ENOMEM;
+	notif->ioset = req->ioset;
 	notif->cqe.user_data = req->cqe.user_data;
 	notif->cqe.res = 0;
 	notif->cqe.flags = IORING_CQE_F_NOTIF;
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..e7571dc46da5 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -86,6 +86,48 @@ int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 	return -EINVAL;
 }
 
+/* Validate one user-supplied @reg and store it into @set; 0 or -EINVAL. */
+static int io_update_ioset(struct io_ring_ctx *ctx,
+			   const struct io_uring_ioset_reg *reg,
+			   struct io_set *set)
+{
+	/* sets are only available on rings created with IORING_SETUP_IOSET */
+	if (!(ctx->flags & IORING_SETUP_IOSET))
+		return -EINVAL;
+	/* no flags are defined yet and the reserved words must be zero */
+	if (reg->flags || reg->__resv[0] || reg->__resv[1] || reg->__resv[2])
+		return -EINVAL;
+	set->flags = reg->flags;
+	return 0;
+}
+
+/*
+ * Register up to ARRAY_SIZE(ctx->iosets) parameter sets from the user array
+ * @arg of @nr_args entries; rejected once sets exist. Returns 0 or -errno.
+ * TODO: one time setup, should be made more dynamic
+ */
+static int io_register_iosets(struct io_ring_ctx *ctx,
+			      void __user *arg, unsigned int nr_args)
+{
+	struct io_uring_ioset_reg reg[ARRAY_SIZE(ctx->iosets)];
+	unsigned int i;
+	int ret;
+
+	if (ctx->nr_iosets || nr_args > ARRAY_SIZE(ctx->iosets))
+		return -EINVAL;
+	if (copy_from_user(reg, arg, sizeof(reg[0]) * nr_args))
+		return -EFAULT;
+	for (i = 0; i < nr_args; i++) {
+		ret = io_update_ioset(ctx, &reg[i], &ctx->iosets[i]);
+		if (ret) {
+			/* drop every set written so far, not only the first */
+			memset(ctx->iosets, 0, sizeof(ctx->iosets));
+			return ret;
+		}
+	}
+	ctx->nr_iosets = nr_args;
+	return 0;
+}
 
 static int io_register_personality(struct io_ring_ctx *ctx)
 {
@@ -93,6 +135,9 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	u32 id;
 	int ret;
 
+	if (ctx->flags & IORING_SETUP_IOSET)
+		return -EINVAL;
+
 	creds = get_current_cred();
 
 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
@@ -846,6 +891,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_cqwait_reg(ctx, arg);
 		break;
+	case IORING_REGISTER_IOSETS:
+		ret = -EINVAL;
+		if (!arg)
+			break;
+		ret = io_register_iosets(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
2.46.0


  reply	other threads:[~2024-11-10 14:55 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-11-10 14:56 [RFC 0/3] request parameter set api and wait termination tuning Pavel Begunkov
2024-11-10 14:56 ` Pavel Begunkov [this message]
2024-11-10 14:56 ` [RFC 2/3] io_uring: add support for ignoring inline completions for waits Pavel Begunkov
2024-11-10 14:56 ` [RFC 3/3] io_uring: allow waiting loop to ignore some CQEs Pavel Begunkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=877a43b660a5fec4d658007a8c77bf73471b0b64.1731205010.git.asml.silence@gmail.com \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox