public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCHSET v3 0/14] Rewrite rsrc node handling
@ 2024-10-29 15:16 Jens Axboe
  2024-10-29 15:16 ` [PATCH 01/14] io_uring/nop: add support for testing registered files and buffers Jens Axboe
                   ` (13 more replies)
  0 siblings, 14 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring

Hi,

Here's v3 of this patchset, it's looking pretty clean by now. For the
v2 posting, please look here:

https://lore.kernel.org/io-uring/[email protected]/T/#ma92ca3d24796b56414c68e49213bf6455002eb06

This series can also be found here:

https://git.kernel.dk/cgit/linux/log/?h=io_uring-rsrc

with previous versions in the .1 and .2 postfixes of that branch.

Changes since v2:
- Don't pass in pointer to index to io_rsrc_node_lookup()
- Add io_reset_rsrc_node() helper, which cleans up some of the "lookup
  old node, if it exists, and put it" logic in various spots.
- Get rid of 'rsrc' member in buf/file union, clear the right pointer
  when the specific resource is put
- Drop unused IORING_RSRC_INVALID type
- Drop unused 'index' argument for io_rsrc_node_alloc()
- Use rsrc_empty_node consistently
- Rebase on current for-6.13 + 6.12 fixes

 include/linux/io_uring_types.h |  25 +-
 include/uapi/linux/io_uring.h  |   3 +
 io_uring/cancel.c              |   8 +-
 io_uring/fdinfo.c              |  14 +-
 io_uring/filetable.c           |  66 ++--
 io_uring/filetable.h           |  31 +-
 io_uring/io_uring.c            |  51 +--
 io_uring/msg_ring.c            |  31 +-
 io_uring/net.c                 |  15 +-
 io_uring/nop.c                 |  47 ++-
 io_uring/notif.c               |   3 +-
 io_uring/opdef.c               |   2 +
 io_uring/register.c            |   3 +-
 io_uring/rsrc.c                | 578 +++++++++++----------------------
 io_uring/rsrc.h                |  98 +++---
 io_uring/rw.c                  |  12 +-
 io_uring/splice.c              |  42 ++-
 io_uring/splice.h              |   1 +
 io_uring/uring_cmd.c           |  19 +-
 19 files changed, 414 insertions(+), 635 deletions(-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 01/14] io_uring/nop: add support for testing registered files and buffers
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 02/14] io_uring/rsrc: move struct io_fixed_file to rsrc.h header Jens Axboe
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Useful for testing performance/efficiency impact of registered files
and buffers, vs (particularly) non-registered files.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/uapi/linux/io_uring.h |  3 +++
 io_uring/nop.c                | 49 +++++++++++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 65b7417c1b05..024745283783 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -416,6 +416,9 @@ enum io_uring_msg_ring_flags {
  * IORING_NOP_INJECT_RESULT	Inject result from sqe->result
  */
 #define IORING_NOP_INJECT_RESULT	(1U << 0)
+#define IORING_NOP_FILE			(1U << 1)
+#define IORING_NOP_FIXED_FILE		(1U << 2)
+#define IORING_NOP_FIXED_BUFFER		(1U << 3)
 
 /*
  * IO completion data structure (Completion Queue Entry)
diff --git a/io_uring/nop.c b/io_uring/nop.c
index a5bcf3d6984f..2c7a22ba4053 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -8,35 +8,74 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "rsrc.h"
 #include "nop.h"
 
 struct io_nop {
 	/* NOTE: kiocb has the file as the first member, so don't do it here */
 	struct file     *file;
 	int             result;
+	int		fd;
+	int		buffer;
+	unsigned int	flags;
 };
 
+#define NOP_FLAGS	(IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
+			 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+
 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	unsigned int flags;
 	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
 
-	flags = READ_ONCE(sqe->nop_flags);
-	if (flags & ~IORING_NOP_INJECT_RESULT)
+	nop->flags = READ_ONCE(sqe->nop_flags);
+	if (nop->flags & ~NOP_FLAGS)
 		return -EINVAL;
 
-	if (flags & IORING_NOP_INJECT_RESULT)
+	if (nop->flags & IORING_NOP_INJECT_RESULT)
 		nop->result = READ_ONCE(sqe->len);
 	else
 		nop->result = 0;
+	if (nop->flags & IORING_NOP_FIXED_FILE)
+		nop->fd = READ_ONCE(sqe->fd);
+	if (nop->flags & IORING_NOP_FIXED_BUFFER)
+		nop->buffer = READ_ONCE(sqe->buf_index);
 	return 0;
 }
 
 int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
+	int ret = nop->result;
+
+	if (nop->flags & IORING_NOP_FILE) {
+		if (nop->flags & IORING_NOP_FIXED_FILE) {
+			req->file = io_file_get_fixed(req, nop->fd, issue_flags);
+			req->flags |= REQ_F_FIXED_FILE;
+		} else {
+			req->file = io_file_get_normal(req, nop->fd);
+		}
+		if (!req->file) {
+			ret = -EBADF;
+			goto done;
+		}
+	}
+	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
+		struct io_ring_ctx *ctx = req->ctx;
+		struct io_mapped_ubuf *imu;
+		int idx;
 
-	if (nop->result < 0)
+		ret = -EFAULT;
+		io_ring_submit_lock(ctx, issue_flags);
+		if (nop->buffer < ctx->nr_user_bufs) {
+			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
+			imu = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_set_rsrc_node(req, ctx);
+			ret = 0;
+		}
+		io_ring_submit_unlock(ctx, issue_flags);
+	}
+done:
+	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, nop->result, 0);
 	return IOU_OK;
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 02/14] io_uring/rsrc: move struct io_fixed_file to rsrc.h header
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
  2024-10-29 15:16 ` [PATCH 01/14] io_uring/nop: add support for testing registered files and buffers Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache Jens Axboe
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

There's no need for this internal structure to be visible, move it to
the private rsrc.h header instead.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h | 5 -----
 io_uring/filetable.h           | 1 +
 io_uring/rsrc.h                | 5 +++++
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2f12828b22a4..d4ba4ae480d6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,11 +55,6 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
-};
-
 struct io_file_table {
 	struct io_fixed_file *files;
 	unsigned long *bitmap;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index b2435c4dca1f..c027ed4ad68d 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -4,6 +4,7 @@
 
 #include <linux/file.h>
 #include <linux/io_uring_types.h>
+#include "rsrc.h"
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
 void io_free_file_tables(struct io_file_table *table);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c50d4be4aa6d..e072fb3ee351 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -40,6 +40,11 @@ struct io_rsrc_node {
 	struct io_rsrc_put		item;
 };
 
+struct io_fixed_file {
+	/* file * with additional FFS_* flags */
+	unsigned long file_ptr;
+};
+
 struct io_mapped_ubuf {
 	u64		ubuf;
 	unsigned int	len;
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
  2024-10-29 15:16 ` [PATCH 01/14] io_uring/nop: add support for testing registered files and buffers Jens Axboe
  2024-10-29 15:16 ` [PATCH 02/14] io_uring/rsrc: move struct io_fixed_file to rsrc.h header Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-11-19 15:36   ` Guenter Roeck
  2024-10-29 15:16 ` [PATCH 04/14] io_uring/splice: open code 2nd direct file assignment Jens Axboe
                   ` (10 subsequent siblings)
  13 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Doesn't matter right now as there are still some bytes left for it, but
let's prepare for the io_kiocb potentially growing and add a specific
freeptr offset for it.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2863b957e373..a09c67b38c1b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3846,6 +3846,8 @@ static int __init io_uring_init(void)
 	struct kmem_cache_args kmem_args = {
 		.useroffset = offsetof(struct io_kiocb, cmd.data),
 		.usersize = sizeof_field(struct io_kiocb, cmd.data),
+		.freeptr_offset = offsetof(struct io_kiocb, work),
+		.use_freeptr_offset = true,
 	};
 
 #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 04/14] io_uring/splice: open code 2nd direct file assignment
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (2 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 05/14] io_uring/rsrc: kill io_charge_rsrc_node() Jens Axboe
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

In preparation for not pinning the whole registered file table, open
code the second potential direct file assignment. This will be handled
by appropriate helpers in the future, for now just do it manually.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/opdef.c  |  2 ++
 io_uring/splice.c | 44 ++++++++++++++++++++++++++++++++++++--------
 io_uring/splice.h |  1 +
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a2be3bbca5ff..3de75eca1c92 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -641,6 +641,7 @@ const struct io_cold_def io_cold_defs[] = {
 	},
 	[IORING_OP_SPLICE] = {
 		.name			= "SPLICE",
+		.cleanup		= io_splice_cleanup,
 	},
 	[IORING_OP_PROVIDE_BUFFERS] = {
 		.name			= "PROVIDE_BUFFERS",
@@ -650,6 +651,7 @@ const struct io_cold_def io_cold_defs[] = {
 	},
 	[IORING_OP_TEE] = {
 		.name			= "TEE",
+		.cleanup		= io_splice_cleanup,
 	},
 	[IORING_OP_SHUTDOWN] = {
 		.name			= "SHUTDOWN",
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 3b659cd23e9d..e62bc6497a94 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -21,6 +21,7 @@ struct io_splice {
 	u64				len;
 	int				splice_fd_in;
 	unsigned int			flags;
+	struct io_rsrc_node		*rsrc_node;
 };
 
 static int __io_splice_prep(struct io_kiocb *req,
@@ -34,6 +35,7 @@ static int __io_splice_prep(struct io_kiocb *req,
 	if (unlikely(sp->flags & ~valid_flags))
 		return -EINVAL;
 	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
+	sp->rsrc_node = NULL;
 	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -45,6 +47,38 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return __io_splice_prep(req, sqe);
 }
 
+void io_splice_cleanup(struct io_kiocb *req)
+{
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
+
+	io_put_rsrc_node(req->ctx, sp->rsrc_node);
+}
+
+static struct file *io_splice_get_file(struct io_kiocb *req,
+				       unsigned int issue_flags)
+{
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_fixed_file *slot;
+	struct file *file = NULL;
+
+	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+		return io_file_get_normal(req, sp->splice_fd_in);
+
+	io_ring_submit_lock(ctx, issue_flags);
+	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
+		goto out;
+	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
+	slot = &ctx->file_table.files[sp->splice_fd_in];
+	if (!req->rsrc_node)
+		__io_req_set_rsrc_node(req, ctx);
+	file = io_slot_file(slot);
+	req->flags |= REQ_F_NEED_CLEANUP;
+out:
+	io_ring_submit_unlock(ctx, issue_flags);
+	return file;
+}
+
 int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
@@ -55,10 +89,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 
 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
-	if (sp->flags & SPLICE_F_FD_IN_FIXED)
-		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
-	else
-		in = io_file_get_normal(req, sp->splice_fd_in);
+	in = io_splice_get_file(req, issue_flags);
 	if (!in) {
 		ret = -EBADF;
 		goto done;
@@ -96,10 +127,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 
 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
-	if (sp->flags & SPLICE_F_FD_IN_FIXED)
-		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
-	else
-		in = io_file_get_normal(req, sp->splice_fd_in);
+	in = io_splice_get_file(req, issue_flags);
 	if (!in) {
 		ret = -EBADF;
 		goto done;
diff --git a/io_uring/splice.h b/io_uring/splice.h
index 542f94168ad3..b9b2848327fb 100644
--- a/io_uring/splice.h
+++ b/io_uring/splice.h
@@ -3,5 +3,6 @@
 int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_tee(struct io_kiocb *req, unsigned int issue_flags);
 
+void io_splice_cleanup(struct io_kiocb *req);
 int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_splice(struct io_kiocb *req, unsigned int issue_flags);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 05/14] io_uring/rsrc: kill io_charge_rsrc_node()
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (3 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 04/14] io_uring/splice: open code 2nd direct file assignment Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 06/14] io_uring/rsrc: get rid of per-ring io_rsrc_node list Jens Axboe
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

It's only used from __io_req_set_rsrc_node(), and it takes both the ctx
and node itself, while never using the ctx. Just open-code the basic
refs++ in __io_req_set_rsrc_node() instead.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/rsrc.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index e072fb3ee351..1589c9740083 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -97,18 +97,12 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
 		io_rsrc_node_ref_zero(node);
 }
 
-static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
-				       struct io_rsrc_node *node)
-{
-	node->refs++;
-}
-
 static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
 					  struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 	req->rsrc_node = ctx->rsrc_node;
-	io_charge_rsrc_node(ctx, ctx->rsrc_node);
+	ctx->rsrc_node->refs++;
 }
 
 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 06/14] io_uring/rsrc: get rid of per-ring io_rsrc_node list
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (4 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 05/14] io_uring/rsrc: kill io_charge_rsrc_node() Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 07/14] io_uring/rsrc: get rid of io_rsrc_node allocation cache Jens Axboe
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.

As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.

At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.

Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion are both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.

Outside of removing the stall in resource reclaim described above, it
has the following advantages:

1) It's a lot simpler than the previous scheme, and easier to follow.
   No need for specific quiesce handling anymore.

2) There are no resource node allocations in the fast path, all of that
   happens at resource registration time.

3) The structs related to resource handling can all get simplified
   quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
   go away completely.

4) Handling of resource tags is much simpler, and doesn't require
   persistent storage as it can simply get assigned up front at
   registration time. Just copy them in one-by-one at registration time
   and assign to the resource node.

The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.

With this in place, the above-mentioned example will be using exactly 5
files at the end of the loop, not 5 + N.

Needs to get broken up a bit and there are certainly rough edges, but
that's why it's a work in progress... But it does remove a ton more code
than it adds, and passes the liburing tests.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  10 +-
 io_uring/fdinfo.c              |   2 +-
 io_uring/filetable.c           |  52 ++--
 io_uring/filetable.h           |  25 +-
 io_uring/io_uring.c            |  38 +--
 io_uring/net.c                 |  11 +-
 io_uring/nop.c                 |   6 +-
 io_uring/notif.c               |   3 +-
 io_uring/rsrc.c                | 482 ++++++++++++---------------------
 io_uring/rsrc.h                |  72 ++---
 io_uring/rw.c                  |   8 +-
 io_uring/splice.c              |  16 +-
 io_uring/uring_cmd.c           |  12 +-
 13 files changed, 272 insertions(+), 465 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d4ba4ae480d6..42c5f2c992c4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -56,7 +56,7 @@ struct io_wq_work {
 };
 
 struct io_file_table {
-	struct io_fixed_file *files;
+	struct io_rsrc_node **nodes;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -264,7 +264,6 @@ struct io_ring_ctx {
 		 * Fixed resources fast path, should be accessed only under
 		 * uring_lock, and updated through io_uring_register(2)
 		 */
-		struct io_rsrc_node	*rsrc_node;
 		atomic_t		cancel_seq;
 
 		/*
@@ -277,7 +276,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_mapped_ubuf	**user_bufs;
+		struct io_rsrc_node	**user_bufs;
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 
@@ -372,10 +371,7 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*buf_data;
 
 	/* protected by ->uring_lock */
-	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
-	struct wait_queue_head		rsrc_quiesce_wq;
-	unsigned			rsrc_quiesce;
 
 	u32			pers_next;
 	struct xarray		personalities;
@@ -642,7 +638,7 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node		*rsrc_node;
+	struct io_rsrc_node		*rsrc_nodes[2];
 
 	atomic_t			refs;
 	bool				cancel_seq_set;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index a6bac533edbe..064a79475c5f 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -176,7 +176,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	}
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 997c56d32ee6..a01be324ac15 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,14 +38,14 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
-				GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->files))
+	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (unlikely(!table->nodes))
 		return false;
 
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
 	if (unlikely(!table->bitmap)) {
-		kvfree(table->files);
+		kvfree(table->nodes);
 		return false;
 	}
 
@@ -54,9 +54,9 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->files);
+	kvfree(table->nodes);
 	bitmap_free(table->bitmap);
-	table->files = NULL;
+	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -64,8 +64,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 				 u32 slot_index)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
+	struct io_rsrc_node *node;
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
@@ -74,22 +73,18 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->nr_user_files)
 		return -EINVAL;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
-	if (file_slot->file_ptr) {
-		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-					    io_slot_file(file_slot));
-		if (ret)
-			return ret;
+	node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+	if (IS_ERR(node))
+		return -ENOMEM;
 
-		file_slot->file_ptr = 0;
-	} else {
+	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+	if (ctx->file_table.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
-	}
 
-	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
-	io_fixed_file_set(file_slot, file);
+	ctx->file_table.nodes[slot_index] = node;
+	io_fixed_file_set(node, file);
 	return 0;
 }
 
@@ -134,25 +129,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
-
 	if (unlikely(!ctx->file_data))
 		return -ENXIO;
 	if (offset >= ctx->nr_user_files)
 		return -EINVAL;
 
 	offset = array_index_nospec(offset, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
-	if (!file_slot->file_ptr)
+	if (!ctx->file_table.nodes[offset])
 		return -EBADF;
-
-	ret = io_queue_rsrc_removal(ctx->file_data, offset,
-				    io_slot_file(file_slot));
-	if (ret)
-		return ret;
-
-	file_slot->file_ptr = 0;
+	io_put_rsrc_node(ctx->file_table.nodes[offset]);
+	ctx->file_table.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index c027ed4ad68d..47616079abaa 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -34,36 +34,35 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
 	table->alloc_hint = bit + 1;
 }
 
-static inline struct io_fixed_file *
-io_fixed_file_slot(struct io_file_table *table, unsigned i)
-{
-	return &table->files[i];
-}
-
 #define FFS_NOWAIT		0x1UL
 #define FFS_ISREG		0x2UL
 #define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
 
-static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
+static inline unsigned int io_slot_flags(struct io_rsrc_node *node)
 {
-	return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+	return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
 }
 
-static inline struct file *io_slot_file(struct io_fixed_file *slot)
+static inline struct file *io_slot_file(struct io_rsrc_node *node)
 {
-	return (struct file *)(slot->file_ptr & FFS_MASK);
+	return (struct file *)(node->file_ptr & FFS_MASK);
 }
 
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	return io_slot_file(io_fixed_file_slot(table, index));
+	struct io_rsrc_node *node = table->nodes[index];
+
+	if (node)
+		return io_slot_file(node);
+	return NULL;
 }
 
-static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
+static inline void io_fixed_file_set(struct io_rsrc_node *node,
 				     struct file *file)
 {
-	file_slot->file_ptr = (unsigned long)file |
+	node->file_ptr = (unsigned long)file |
 		(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
 }
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a09c67b38c1b..0876aa74c739 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -333,7 +333,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
 	init_waitqueue_head(&ctx->poll_wq);
-	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
@@ -341,7 +340,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
-	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
@@ -1415,7 +1413,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 				io_clean_op(req);
 		}
 		io_put_file(req);
-		io_put_rsrc_node(ctx, req->rsrc_node);
+		io_req_put_rsrc_nodes(req);
 		io_put_task(req->task);
 
 		node = req->comp_list.next;
@@ -1878,7 +1876,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 				      unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
@@ -1886,11 +1884,12 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 		goto out;
 	fd = array_index_nospec(fd, ctx->nr_user_files);
-	slot = io_fixed_file_slot(&ctx->file_table, fd);
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	req->flags |= io_slot_flags(slot);
-	file = io_slot_file(slot);
+	node = ctx->file_table.nodes[fd];
+	if (node) {
+		io_req_assign_rsrc_node(req, node);
+		req->flags |= io_slot_flags(node);
+		file = io_slot_file(node);
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
@@ -2036,7 +2035,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->rsrc_node = NULL;
+	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 	req->task = current;
 	req->cancel_seq_set = false;
 
@@ -2718,15 +2718,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
-	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
-	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
-		return;
 
 	mutex_lock(&ctx->uring_lock);
-	if (ctx->buf_data)
-		__io_sqe_buffers_unregister(ctx);
-	if (ctx->file_data)
-		__io_sqe_files_unregister(ctx);
+	io_sqe_buffers_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	io_cqring_overflow_kill(ctx);
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2743,11 +2738,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->submitter_task)
 		put_task_struct(ctx->submitter_task);
 
-	/* there are no registered resources left, nobody uses it */
-	if (ctx->rsrc_node)
-		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
-
-	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
@@ -3729,10 +3719,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;
 
-	ret = io_rsrc_init(ctx);
-	if (ret)
-		goto err;
-
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
diff --git a/io_uring/net.c b/io_uring/net.c
index 2040195e33ab..ce1156551d10 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1342,15 +1342,15 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (sr->buf_index < ctx->nr_user_bufs) {
 			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(sr->notif, ctx);
+			node = ctx->user_bufs[idx];
+			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
@@ -1358,8 +1358,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 		if (unlikely(ret))
 			return ret;
 
-		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, imu,
-					(u64)(uintptr_t)sr->buf, sr->len);
+		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
+					node->buf, (u64)(uintptr_t)sr->buf,
+					sr->len);
 		if (unlikely(ret))
 			return ret;
 		kmsg->msg.sg_from_iter = io_sg_from_iter;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 2c7a22ba4053..de91600a3bc6 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -61,15 +61,15 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 	}
 	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (nop->buffer < ctx->nr_user_bufs) {
 			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(req, ctx);
+			node = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 28859ae3ee6e..4f02e969cf08 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,7 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_node = NULL;
+	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index ca2ec8a018be..e32c4d1bef86 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -26,10 +26,8 @@ struct io_rsrc_update {
 	u32				offset;
 };
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage);
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+			struct iovec *iov, int index, struct page **last_hpage);
 
 /* only define max */
 #define IORING_MAX_FIXED_FILES	(1U << 20)
@@ -110,13 +108,13 @@ static int io_buffer_validate(struct iovec *iov)
 	return 0;
 }
 
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;
 
-	*slot = NULL;
-	if (imu != &dummy_ubuf) {
+	if (node->buf != &dummy_ubuf) {
+		struct io_mapped_ubuf *imu = node->buf;
+
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
 		for (i = 0; i < imu->nr_bvecs; i++)
@@ -127,205 +125,56 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	}
 }
 
-static void io_rsrc_put_work(struct io_rsrc_node *node)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx,
+					struct io_rsrc_data *data, int type)
 {
-	struct io_rsrc_put *prsrc = &node->item;
-
-	if (prsrc->tag)
-		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
-
-	switch (node->type) {
-	case IORING_RSRC_FILE:
-		fput(prsrc->file);
-		break;
-	case IORING_RSRC_BUFFER:
-		io_rsrc_buf_put(node->ctx, prsrc);
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		break;
-	}
-}
-
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
-{
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
-}
-
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
-	__must_hold(&node->ctx->uring_lock)
-{
-	struct io_ring_ctx *ctx = node->ctx;
+	struct io_rsrc_node *node;
 
-	while (!list_empty(&ctx->rsrc_ref_list)) {
-		node = list_first_entry(&ctx->rsrc_ref_list,
-					    struct io_rsrc_node, node);
-		/* recycle ref nodes in order */
-		if (node->refs)
-			break;
-		list_del(&node->node);
-
-		if (likely(!node->empty))
-			io_rsrc_put_work(node);
-		io_rsrc_node_destroy(ctx, node);
-	}
-	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
-		wake_up_all(&ctx->rsrc_quiesce_wq);
-}
-
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *ref_node;
-
-	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!ref_node) {
-		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-		if (!ref_node)
+	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (!node) {
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
 			return NULL;
 	}
 
-	ref_node->ctx = ctx;
-	ref_node->empty = 0;
-	ref_node->refs = 1;
-	return ref_node;
-}
-
-__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
-				      struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *backup;
-	DEFINE_WAIT(we);
-	int ret;
-
-	/* As We may drop ->uring_lock, other task may have started quiesce */
-	if (data->quiesce)
-		return -ENXIO;
-
-	backup = io_rsrc_node_alloc(ctx);
-	if (!backup)
-		return -ENOMEM;
-	ctx->rsrc_node->empty = true;
-	ctx->rsrc_node->type = -1;
-	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, ctx->rsrc_node);
-	ctx->rsrc_node = backup;
-
-	if (list_empty(&ctx->rsrc_ref_list))
-		return 0;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 1);
-		smp_mb();
-	}
-
-	ctx->rsrc_quiesce++;
-	data->quiesce = true;
-	do {
-		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
-		mutex_unlock(&ctx->uring_lock);
-
-		ret = io_run_task_work_sig(ctx);
-		if (ret < 0) {
-			finish_wait(&ctx->rsrc_quiesce_wq, &we);
-			mutex_lock(&ctx->uring_lock);
-			if (list_empty(&ctx->rsrc_ref_list))
-				ret = 0;
-			break;
-		}
-
-		schedule();
-		mutex_lock(&ctx->uring_lock);
-		ret = 0;
-	} while (!list_empty(&ctx->rsrc_ref_list));
-
-	finish_wait(&ctx->rsrc_quiesce_wq, &we);
-	data->quiesce = false;
-	ctx->rsrc_quiesce--;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 0);
-		smp_mb();
-	}
-	return ret;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
-	for (i = 0; i < nr_tables; i++)
-		kfree(table[i]);
-	kfree(table);
+	node->ctx = ctx;
+	node->refs = 1;
+	node->type = type;
+	return node;
 }
 
 static void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	size_t size = data->nr * sizeof(data->tags[0][0]);
-
-	if (data->tags)
-		io_free_page_table((void **)data->tags, size);
-	kfree(data);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-	size_t init_size = size;
-	void **table;
-
-	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
-	if (!table)
-		return NULL;
+	int i;
 
-	for (i = 0; i < nr_tables; i++) {
-		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+	for (i = 0; i < data->nr; i++) {
+		struct io_rsrc_node *node = data->nodes[i];
 
-		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
-		if (!table[i]) {
-			io_free_page_table(table, init_size);
-			return NULL;
-		}
-		size -= this_size;
+		io_put_rsrc_node(node);
 	}
-	return table;
+	kvfree(data->nodes);
+	kfree(data);
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
-				     u64 __user *utags,
-				     unsigned nr, struct io_rsrc_data **pdata)
+__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
+				     struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
-	int ret = 0;
-	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
-	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
-	if (!data->tags) {
+
+	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!data->nodes) {
 		kfree(data);
 		return -ENOMEM;
 	}
 
 	data->nr = nr;
-	data->ctx = ctx;
-	data->rsrc_type = type;
-	if (utags) {
-		ret = -EFAULT;
-		for (i = 0; i < nr; i++) {
-			u64 *tag_slot = io_get_tag_slot(data, i);
-
-			if (copy_from_user(tag_slot, &utags[i],
-					   sizeof(*tag_slot)))
-				goto fail;
-		}
-	}
 	*pdata = data;
 	return 0;
-fail:
-	io_rsrc_data_free(data);
-	return ret;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -334,8 +183,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	u64 __user *tags = u64_to_user_ptr(up->tags);
 	__s32 __user *fds = u64_to_user_ptr(up->data);
-	struct io_rsrc_data *data = ctx->file_data;
-	struct io_fixed_file *file_slot;
 	int fd, i, err = 0;
 	unsigned int done;
 
@@ -360,18 +207,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
-		if (file_slot->file_ptr) {
-			err = io_queue_rsrc_removal(data, i,
-						    io_slot_file(file_slot));
-			if (err)
-				break;
-			file_slot->file_ptr = 0;
+		if (ctx->file_table.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.nodes[i]);
+			ctx->file_table.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
 			struct file *file = fget(fd);
+			struct io_rsrc_node *node;
 
 			if (!file) {
 				err = -EBADF;
@@ -385,8 +228,15 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			*io_get_tag_slot(data, i) = tag;
-			io_fixed_file_set(file_slot, file);
+			node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+			if (!node) {
+				err = -ENOMEM;
+				fput(file);
+				break;
+			}
+			ctx->file_table.nodes[i] = node;
+			node->tag = tag;
+			io_fixed_file_set(node, file);
 			io_file_bitmap_set(&ctx->file_table, i);
 		}
 	}
@@ -411,7 +261,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		u64 tag = 0;
 
 		uvec = u64_to_user_ptr(user_data);
@@ -431,23 +281,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
-		if (err)
-			break;
-
 		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
-		if (ctx->user_bufs[i] != &dummy_ubuf) {
-			err = io_queue_rsrc_removal(ctx->buf_data, i,
-						    ctx->user_bufs[i]);
-			if (unlikely(err)) {
-				io_buffer_unmap(ctx, &imu);
-				break;
-			}
-			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
+		node = io_sqe_buffer_register(ctx, iov, i, &last_hpage);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			break;
 		}
+		io_put_rsrc_node(ctx->user_bufs[i]);
 
-		ctx->user_bufs[i] = imu;
-		*io_get_tag_slot(ctx->buf_data, i) = tag;
+		ctx->user_bufs[i] = node;
+		node->tag = tag;
 		if (ctx->compat)
 			user_data += sizeof(struct compat_iovec);
 		else
@@ -622,38 +465,47 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
+void io_free_rsrc_node(struct io_rsrc_node *node)
 {
-	struct io_ring_ctx *ctx = data->ctx;
-	struct io_rsrc_node *node = ctx->rsrc_node;
-	u64 *tag_slot = io_get_tag_slot(data, idx);
+	struct io_ring_ctx *ctx = node->ctx;
 
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	if (unlikely(!ctx->rsrc_node)) {
-		ctx->rsrc_node = node;
-		return -ENOMEM;
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (node->tag)
+		io_post_aux_cqe(node->ctx, node->tag, 0, 0);
+
+	switch (node->type) {
+	case IORING_RSRC_FILE:
+		if (io_slot_file(node))
+			fput(io_slot_file(node));
+		break;
+	case IORING_RSRC_BUFFER:
+		if (node->buf)
+			io_buffer_unmap(node->ctx, node);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
 
-	node->item.rsrc = rsrc;
-	node->type = data->rsrc_type;
-	node->item.tag = *tag_slot;
-	*tag_slot = 0;
-	list_add_tail(&node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, node);
-	return 0;
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
+		kfree(node);
 }
 
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	int i;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct file *file = io_file_from_index(&ctx->file_table, i);
+		struct io_rsrc_node *node = ctx->file_table.nodes[i];
 
-		if (!file)
-			continue;
-		io_file_bitmap_clear(&ctx->file_table, i);
-		fput(file);
+		if (node) {
+			io_put_rsrc_node(node);
+			io_file_bitmap_clear(&ctx->file_table, i);
+			ctx->file_table.nodes[i] = NULL;
+		}
 	}
 
 	io_free_file_tables(&ctx->file_table);
@@ -665,22 +517,11 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_files;
-	int ret;
-
 	if (!ctx->file_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_files = 0;
-	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
-	ctx->nr_user_files = nr;
-	if (!ret)
-		__io_sqe_files_unregister(ctx);
-	return ret;
+	__io_sqe_files_unregister(ctx);
+	return 0;
 }
 
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -699,8 +540,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
-				 &ctx->file_data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
 	if (ret)
 		return ret;
 
@@ -711,16 +551,18 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct io_fixed_file *file_slot;
+		struct io_rsrc_node *node;
+		u64 tag = 0;
 
-		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
+			goto fail;
+		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
 			goto fail;
-		}
 		/* allow sparse sets */
 		if (!fds || fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
+			if (tag)
 				goto fail;
 			continue;
 		}
@@ -737,8 +579,16 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto fail;
 		}
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-		io_fixed_file_set(file_slot, file);
+		ret = -ENOMEM;
+		node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+		if (!node) {
+			fput(file);
+			goto fail;
+		}
+		if (tag)
+			node->tag = tag;
+		ctx->file_table.nodes[i] = node;
+		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
@@ -750,43 +600,30 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-	io_buffer_unmap(ctx, &prsrc->buf);
-	prsrc->buf = NULL;
-}
-
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	unsigned int i;
 
-	for (i = 0; i < ctx->nr_user_bufs; i++)
-		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
-	kfree(ctx->user_bufs);
-	io_rsrc_data_free(ctx->buf_data);
+	lockdep_assert_held(&ctx->uring_lock);
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		io_put_rsrc_node(ctx->user_bufs[i]);
+		ctx->user_bufs[i] = NULL;
+	}
+	kvfree(ctx->user_bufs);
 	ctx->user_bufs = NULL;
+	io_rsrc_data_free(ctx->buf_data);
 	ctx->buf_data = NULL;
 	ctx->nr_user_bufs = 0;
 }
 
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_bufs;
-	int ret;
-
 	if (!ctx->buf_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_bufs = 0;
-	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
-	ctx->nr_user_bufs = nr;
-	if (!ret)
-		__io_sqe_buffers_unregister(ctx);
-	return ret;
+	__io_sqe_buffers_unregister(ctx);
+	return 0;
 }
 
 /*
@@ -813,7 +650,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 
 	/* check previously registered pages */
 	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		struct io_rsrc_node *node = ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
 			if (!PageCompound(imu->bvec[j].bv_page))
@@ -950,21 +788,28 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
 	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
 }
 
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage)
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+						   struct iovec *iov,
+						   int index,
+						   struct page **last_hpage)
 {
 	struct io_mapped_ubuf *imu = NULL;
 	struct page **pages = NULL;
+	struct io_rsrc_node *node;
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
 	struct io_imu_folio_data data;
 	bool coalesced;
 
-	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
-	if (!iov->iov_base)
-		return 0;
+	node = io_rsrc_node_alloc(ctx, ctx->buf_data, IORING_RSRC_BUFFER);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	if (!iov->iov_base) {
+		node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		return node;
+	}
 
 	ret = -ENOMEM;
 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
@@ -998,7 +843,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
 	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
-	*pimu = imu;
+	node->buf = imu;
 	ret = 0;
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1010,10 +855,14 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		size -= vec_len;
 	}
 done:
-	if (ret)
+	if (ret) {
 		kvfree(imu);
+		if (node)
+			io_put_rsrc_node(node);
+		node = ERR_PTR(ret);
+	}
 	kvfree(pages);
-	return ret;
+	return node;
 }
 
 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
@@ -1037,7 +886,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
 	if (ret)
 		return ret;
 	ret = io_buffers_map_alloc(ctx, nr_args);
@@ -1050,6 +899,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		memset(iov, 0, sizeof(*iov));
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+		struct io_rsrc_node *node;
+		u64 tag = 0;
+
 		if (arg) {
 			uvec = (struct iovec __user *) arg;
 			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
@@ -1066,15 +918,24 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 				arg += sizeof(struct iovec);
 		}
 
-		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
-			ret = -EINVAL;
-			break;
+		if (tags) {
+			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
+				ret = -EFAULT;
+				break;
+			}
+			if (tag && !iov->iov_base) {
+				ret = -EINVAL;
+				break;
+			}
 		}
 
-		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
-					     &last_hpage);
-		if (ret)
+		node = io_sqe_buffer_register(ctx, iov, i, &last_hpage);
+		if (IS_ERR(node)) {
+			ret = PTR_ERR(node);
 			break;
+		}
+		node->tag = tag;
+		ctx->user_bufs[i] = node;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
@@ -1148,7 +1009,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_mapped_ubuf **user_bufs;
+	struct io_rsrc_node **user_bufs;
 	struct io_rsrc_data *data;
 	int i, ret, nbufs;
 
@@ -1163,21 +1024,31 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	nbufs = src_ctx->nr_user_bufs;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
 	if (ret)
 		goto out_unlock;
 
 	ret = -ENOMEM;
-	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
 	if (!user_bufs)
 		goto out_free_data;
 
 	for (i = 0; i < nbufs; i++) {
-		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = src_ctx->user_bufs[i]->buf;
+		struct io_rsrc_node *dst_node;
 
-		if (src != &dummy_ubuf)
-			refcount_inc(&src->refs);
-		user_bufs[i] = src;
+		dst_node = io_rsrc_node_alloc(ctx, data, IORING_RSRC_BUFFER);
+		if (!dst_node)
+			goto out_put_free;
+
+		if (imu == &dummy_ubuf) {
+			dst_node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		} else {
+			refcount_inc(&imu->refs);
+			dst_node->buf = imu;
+		}
+		user_bufs[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
@@ -1190,12 +1061,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return 0;
 	}
 
+	mutex_unlock(&ctx->uring_lock);
+	mutex_lock(&src_ctx->uring_lock);
 	/* someone raced setting up buffers, dump ours */
-	for (i = 0; i < nbufs; i++)
-		io_buffer_unmap(ctx, &user_bufs[i]);
-	io_rsrc_data_free(data);
-	kfree(user_bufs);
-	return -EBUSY;
+	ret = -EBUSY;
+	i = nbufs;
+out_put_free:
+	while (i--) {
+		io_buffer_unmap(src_ctx, user_bufs[i]);
+		kfree(user_bufs[i]);
+	}
+	kvfree(user_bufs);
 out_free_data:
 	io_rsrc_data_free(data);
 out_unlock:
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 1589c9740083..9797dcc2a7b5 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,36 +13,21 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_put {
-	u64 tag;
-	union {
-		void *rsrc;
-		struct file *file;
-		struct io_mapped_ubuf *buf;
-	};
-};
-
 struct io_rsrc_data {
-	struct io_ring_ctx		*ctx;
-
-	u64				**tags;
 	unsigned int			nr;
-	u16				rsrc_type;
-	bool				quiesce;
+	struct io_rsrc_node		**nodes;
 };
 
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
-	bool				empty;
 	u16				type;
-	struct list_head		node;
-	struct io_rsrc_put		item;
-};
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
+	u64 tag;
+	union {
+		unsigned long file_ptr;
+		struct io_mapped_ubuf *buf;
+	};
 };
 
 struct io_mapped_ubuf {
@@ -63,21 +48,18 @@ struct io_imu_folio_data {
 	unsigned int	folio_shift;
 };
 
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx,
+					struct io_rsrc_data *data, int type);
+void io_free_rsrc_node(struct io_rsrc_node *node);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len);
 
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags);
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			  unsigned nr_args, u64 __user *tags);
@@ -89,41 +71,23 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int size, unsigned int type);
 
-static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
+static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 {
-	lockdep_assert_held(&ctx->uring_lock);
-
 	if (node && !--node->refs)
-		io_rsrc_node_ref_zero(node);
-}
-
-static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
-					  struct io_ring_ctx *ctx)
-{
-	lockdep_assert_held(&ctx->uring_lock);
-	req->rsrc_node = ctx->rsrc_node;
-	ctx->rsrc_node->refs++;
+		io_free_rsrc_node(node);
 }
 
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-					struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-}
-
-static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
-{
-	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
-	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
-	return &data->tags[table_idx][off];
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
 }
 
-static inline int io_rsrc_init(struct io_ring_ctx *ctx)
+static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
+					   struct io_rsrc_node *node)
 {
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	return ctx->rsrc_node ? 0 : -ENOMEM;
+	node->refs++;
+	req->rsrc_nodes[node->type] = node;
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8080ffd6d571..65491f4f2c7e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,7 +330,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
 	struct io_async_rw *io;
 	u16 index;
 	int ret;
@@ -342,11 +342,11 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	imu = ctx->user_bufs[index];
-	io_req_set_rsrc_node(req, ctx);
+	node = ctx->user_bufs[index];
+	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
-	ret = io_import_fixed(ddir, &io->iter, imu, rw->addr, rw->len);
+	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
 	iov_iter_save_state(&io->iter, &io->iter_state);
 	return ret;
 }
diff --git a/io_uring/splice.c b/io_uring/splice.c
index e62bc6497a94..a0b4e0435b8b 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ void io_splice_cleanup(struct io_kiocb *req)
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 
-	io_put_rsrc_node(req->ctx, sp->rsrc_node);
+	io_put_rsrc_node(sp->rsrc_node);
 }
 
 static struct file *io_splice_get_file(struct io_kiocb *req,
@@ -59,7 +59,7 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
@@ -69,11 +69,13 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
 		goto out;
 	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	slot = &ctx->file_table.files[sp->splice_fd_in];
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	file = io_slot_file(slot);
-	req->flags |= REQ_F_NEED_CLEANUP;
+	node = ctx->file_table.nodes[sp->splice_fd_in];
+	if (node) {
+		node->refs++;
+		sp->rsrc_node = node;
+		file = io_slot_file(node);
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 6994f60d7ec7..0899c71008ae 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -220,7 +220,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_set_rsrc_node(req, ctx);
+		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
@@ -276,15 +276,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
-	struct io_ring_ctx *ctx = req->ctx;
+	struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER];
 
 	/* Must have had rsrc_node assigned at prep time */
-	if (req->rsrc_node) {
-		struct io_mapped_ubuf *imu;
-
-		imu = READ_ONCE(ctx->user_bufs[req->buf_index]);
-		return io_import_fixed(rw, iter, imu, ubuf, len);
-	}
+	if (node)
+		return io_import_fixed(rw, iter, node->buf, ubuf, len);
 
 	return -EFAULT;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 07/14] io_uring/rsrc: get rid of io_rsrc_node allocation cache
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (5 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 06/14] io_uring/rsrc: get rid of per-ring io_rsrc_node list Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 08/14] io_uring/rsrc: add an empty io_rsrc_node for sparse buffer entries Jens Axboe
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

It's not going to be needed in the fast path going forward, so kill it
off.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  3 ---
 io_uring/io_uring.c            |  6 +-----
 io_uring/rsrc.c                | 18 ++++++------------
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 42c5f2c992c4..696f2a05a98b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -370,9 +370,6 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
-	/* protected by ->uring_lock */
-	struct io_alloc_cache		rsrc_node_cache;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0876aa74c739..094788cca47f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -312,9 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
-			    sizeof(struct io_rsrc_node));
-	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
+	ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
 			    sizeof(struct async_poll));
 	ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
@@ -358,7 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 free_ref:
 	percpu_ref_exit(&ctx->refs);
 err:
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
@@ -2740,7 +2737,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index e32c4d1bef86..16e769ebca87 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,7 +13,6 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
-#include "alloc_cache.h"
 #include "openclose.h"
 #include "rsrc.h"
 #include "memmap.h"
@@ -130,16 +129,12 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx,
 {
 	struct io_rsrc_node *node;
 
-	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!node) {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (!node)
-			return NULL;
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (node) {
+		node->ctx = ctx;
+		node->refs = 1;
+		node->type = type;
 	}
-
-	node->ctx = ctx;
-	node->refs = 1;
-	node->type = type;
 	return node;
 }
 
@@ -488,8 +483,7 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 		break;
 	}
 
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
+	kfree(node);
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 08/14] io_uring/rsrc: add an empty io_rsrc_node for sparse buffer entries
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (6 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 07/14] io_uring/rsrc: get rid of io_rsrc_node allocation cache Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 09/14] io_uring: only initialize io_kiocb rsrc_nodes when needed Jens Axboe
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Rather than allocate an io_rsrc_node for an empty/sparse buffer entry,
add a const entry that can be used for that. This just needs a check
before writing the tag, and the put path needs to check for that sparse
node rather than NULL for validity.

This avoids allocating rsrc nodes for sparse buffer entries.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c |  4 ++--
 io_uring/notif.c    |  4 ++--
 io_uring/rsrc.c     | 49 ++++++++++++++++++++++++++-------------------
 io_uring/rsrc.h     | 11 +++++++---
 io_uring/splice.c   |  2 +-
 5 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 094788cca47f..9282d5fa45d3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2032,8 +2032,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	req->rsrc_nodes[IORING_RSRC_FILE] = rsrc_empty_node;
+	req->rsrc_nodes[IORING_RSRC_BUFFER] = rsrc_empty_node;
 	req->task = current;
 	req->cancel_seq_set = false;
 
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 4f02e969cf08..44bf21c0f810 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,8 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	notif->rsrc_nodes[IORING_RSRC_FILE] = rsrc_empty_node;
+	notif->rsrc_nodes[IORING_RSRC_BUFFER] = rsrc_empty_node;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 16e769ebca87..b1729cbdc749 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -38,6 +38,11 @@ static const struct io_mapped_ubuf dummy_ubuf = {
 	.len = UINT_MAX,
 };
 
+const struct io_rsrc_node empty_node = {
+	.type = IORING_RSRC_BUFFER,
+	.buf = (struct io_mapped_ubuf *) &dummy_ubuf,
+};
+
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
@@ -145,7 +150,8 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
 	for (i = 0; i < data->nr; i++) {
 		struct io_rsrc_node *node = data->nodes[i];
 
-		io_put_rsrc_node(node);
+		if (node)
+			io_put_rsrc_node(node);
 	}
 	kvfree(data->nodes);
 	kfree(data);
@@ -230,7 +236,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				break;
 			}
 			ctx->file_table.nodes[i] = node;
-			node->tag = tag;
+			if (tag)
+				node->tag = tag;
 			io_fixed_file_set(node, file);
 			io_file_bitmap_set(&ctx->file_table, i);
 		}
@@ -282,10 +289,12 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = PTR_ERR(node);
 			break;
 		}
-		io_put_rsrc_node(ctx->user_bufs[i]);
+		if (ctx->user_bufs[i])
+			io_put_rsrc_node(ctx->user_bufs[i]);
 
 		ctx->user_bufs[i] = node;
-		node->tag = tag;
+		if (tag)
+			node->tag = tag;
 		if (ctx->compat)
 			user_data += sizeof(struct compat_iovec);
 		else
@@ -601,8 +610,10 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 	lockdep_assert_held(&ctx->uring_lock);
 
 	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		io_put_rsrc_node(ctx->user_bufs[i]);
-		ctx->user_bufs[i] = NULL;
+		if (ctx->user_bufs[i]) {
+			io_put_rsrc_node(ctx->user_bufs[i]);
+			ctx->user_bufs[i] = NULL;
+		}
 	}
 	kvfree(ctx->user_bufs);
 	ctx->user_bufs = NULL;
@@ -800,11 +811,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 
-	if (!iov->iov_base) {
-		node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
-		return node;
-	}
-
 	ret = -ENOMEM;
 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
 				&nr_pages);
@@ -928,7 +934,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			ret = PTR_ERR(node);
 			break;
 		}
-		node->tag = tag;
+		if (tag)
+			node->tag = tag;
 		ctx->user_bufs[i] = node;
 	}
 
@@ -1029,18 +1036,18 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		goto out_free_data;
 
 	for (i = 0; i < nbufs; i++) {
-		struct io_mapped_ubuf *imu = src_ctx->user_bufs[i]->buf;
+		struct io_rsrc_node *src_node = src_ctx->user_bufs[i];
 		struct io_rsrc_node *dst_node;
 
-		dst_node = io_rsrc_node_alloc(ctx, data, IORING_RSRC_BUFFER);
-		if (!dst_node)
-			goto out_put_free;
-
-		if (imu == &dummy_ubuf) {
-			dst_node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		if (src_node == rsrc_empty_node) {
+			dst_node = rsrc_empty_node;
 		} else {
-			refcount_inc(&imu->refs);
-			dst_node->buf = imu;
+			dst_node = io_rsrc_node_alloc(ctx, data, IORING_RSRC_BUFFER);
+			if (!dst_node)
+				goto out_put_free;
+
+			refcount_inc(&src_node->buf->refs);
+			dst_node->buf = src_node->buf;
 		}
 		user_bufs[i] = dst_node;
 	}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 9797dcc2a7b5..db04d04d4799 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -71,9 +71,12 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int size, unsigned int type);
 
+extern const struct io_rsrc_node empty_node;
+#define rsrc_empty_node	(struct io_rsrc_node *) &empty_node
+
 static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 {
-	if (node && !--node->refs)
+	if (node != rsrc_empty_node && !--node->refs)
 		io_free_rsrc_node(node);
 }
 
@@ -86,8 +89,10 @@ static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
 					   struct io_rsrc_node *node)
 {
-	node->refs++;
-	req->rsrc_nodes[node->type] = node;
+	if (node != rsrc_empty_node) {
+		node->refs++;
+		req->rsrc_nodes[node->type] = node;
+	}
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/splice.c b/io_uring/splice.c
index a0b4e0435b8b..f78afb575ae6 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -35,7 +35,7 @@ static int __io_splice_prep(struct io_kiocb *req,
 	if (unlikely(sp->flags & ~valid_flags))
 		return -EINVAL;
 	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
-	sp->rsrc_node = NULL;
+	sp->rsrc_node = rsrc_empty_node;
 	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 09/14] io_uring: only initialize io_kiocb rsrc_nodes when needed
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (7 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 08/14] io_uring/rsrc: add an empty io_rsrc_node for sparse buffer entries Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 10/14] io_uring/rsrc: unify file and buffer resource tables Jens Axboe
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Add the empty node initialization to the preinit part of the io_kiocb
allocation, and reset the nodes if they have been used.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c |  4 ++--
 io_uring/rsrc.h     | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9282d5fa45d3..60c947114fa3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -947,6 +947,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 {
 	req->ctx = ctx;
+	req->rsrc_nodes[IORING_RSRC_FILE] = rsrc_empty_node;
+	req->rsrc_nodes[IORING_RSRC_BUFFER] = rsrc_empty_node;
 	req->link = NULL;
 	req->async_data = NULL;
 	/* not necessary, but safer to zero */
@@ -2032,8 +2034,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->rsrc_nodes[IORING_RSRC_FILE] = rsrc_empty_node;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = rsrc_empty_node;
 	req->task = current;
 	req->cancel_seq_set = false;
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index db04d04d4799..6a7863f13ea9 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -82,8 +82,14 @@ static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 
 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
+	if (req->rsrc_nodes[IORING_RSRC_FILE] != rsrc_empty_node) {
+		io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
+		req->rsrc_nodes[IORING_RSRC_FILE] = rsrc_empty_node;
+	}
+	if (req->rsrc_nodes[IORING_RSRC_BUFFER] != rsrc_empty_node) {
+		io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
+		req->rsrc_nodes[IORING_RSRC_BUFFER] = rsrc_empty_node;
+	}
 }
 
 static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 10/14] io_uring/rsrc: unify file and buffer resource tables
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (8 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 09/14] io_uring: only initialize io_kiocb rsrc_nodes when needed Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 11/14] io_uring/rsrc: add io_rsrc_node_lookup() helper Jens Axboe
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For files, there's nr_user_files/file_table/file_data, and buffers have
nr_user_bufs/user_bufs/buf_data. There's no reason why file_table and
file_data can't be the same thing, and ditto for the buffer side. That
gets rid of more io_ring_ctx state that's in two spots rather than just
being in one spot, as it should be. Put all the registered file data in
one location, and ditto on the buffer front.

This also avoids having both io_rsrc_data->nodes being an allocated
array, and ->user_bufs[] or ->file_table.nodes. There's no reason to
have this information duplicated. Keep it in one spot, io_rsrc_data,
along with how many resources are available.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  15 ++-
 io_uring/cancel.c              |   4 +-
 io_uring/fdinfo.c              |  10 +-
 io_uring/filetable.c           |  46 +++----
 io_uring/filetable.h           |   2 +-
 io_uring/io_uring.c            |   7 +-
 io_uring/msg_ring.c            |   4 +-
 io_uring/net.c                 |   6 +-
 io_uring/nop.c                 |   6 +-
 io_uring/register.c            |   3 +-
 io_uring/rsrc.c                | 215 +++++++++++----------------------
 io_uring/rsrc.h                |   7 +-
 io_uring/rw.c                  |   6 +-
 io_uring/splice.c              |   6 +-
 io_uring/uring_cmd.c           |   6 +-
 15 files changed, 127 insertions(+), 216 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 696f2a05a98b..77fd508d043a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,8 +55,13 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
+struct io_rsrc_data {
+	unsigned int			nr;
+	struct io_rsrc_node		**nodes;
+};
+
 struct io_file_table {
-	struct io_rsrc_node **nodes;
+	struct io_rsrc_data data;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -276,9 +281,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_rsrc_node	**user_bufs;
-		unsigned		nr_user_files;
-		unsigned		nr_user_bufs;
+		struct io_rsrc_data	buf_table;
 
 		struct io_submit_state	submit_state;
 
@@ -366,10 +369,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
-	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_data		*file_data;
-	struct io_rsrc_data		*buf_data;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index cc3475b22ae5..3a2996307025 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -240,9 +240,9 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
 	/* fixed must be grabbed every time since we drop the uring_lock */
 	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
 	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
-		if (unlikely(fd >= ctx->nr_user_files))
+		if (unlikely(fd >= ctx->file_table.data.nr))
 			return -EBADF;
-		fd = array_index_nospec(fd, ctx->nr_user_files);
+		fd = array_index_nospec(fd, ctx->file_table.data.nr);
 		cd->file = io_file_from_index(&ctx->file_table, fd);
 		if (!cd->file)
 			return -EBADF;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 064a79475c5f..e3f5e9fe5562 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -165,8 +165,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
 	seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
 	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
-	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
-	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
+	seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
+	for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
 		struct file *f = io_file_from_index(&ctx->file_table, i);
 
 		if (f)
@@ -174,9 +174,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 		else
 			seq_printf(m, "%5u: <none>\n", i);
 	}
-	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
-	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
+	seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
+	for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+		struct io_mapped_ubuf *buf = ctx->buf_table.nodes[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index a01be324ac15..c1bea2d9dce2 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,25 +38,19 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
-					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (unlikely(!table->nodes))
+	if (io_rsrc_data_alloc(&table->data, nr_files))
 		return false;
-
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->bitmap)) {
-		kvfree(table->nodes);
-		return false;
-	}
-
-	return true;
+	if (table->bitmap)
+		return true;
+	io_rsrc_data_free(&table->data);
+	return false;
 }
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->nodes);
+	io_rsrc_data_free(&table->data);
 	bitmap_free(table->bitmap);
-	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -68,22 +62,22 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (slot_index >= ctx->nr_user_files)
+	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+	node = io_rsrc_node_alloc(ctx, &ctx->file_table.data, IORING_RSRC_FILE);
 	if (IS_ERR(node))
 		return -ENOMEM;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	if (ctx->file_table.nodes[slot_index])
-		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	slot_index = array_index_nospec(slot_index, ctx->file_table.data.nr);
+	if (ctx->file_table.data.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.data.nodes[slot_index]);
 	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
 
-	ctx->file_table.nodes[slot_index] = node;
+	ctx->file_table.data.nodes[slot_index] = node;
 	io_fixed_file_set(node, file);
 	return 0;
 }
@@ -129,16 +123,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	if (unlikely(!ctx->file_data))
+	if (unlikely(!ctx->file_table.data.nr))
 		return -ENXIO;
-	if (offset >= ctx->nr_user_files)
+	if (offset >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	offset = array_index_nospec(offset, ctx->nr_user_files);
-	if (!ctx->file_table.nodes[offset])
+	offset = array_index_nospec(offset, ctx->file_table.data.nr);
+	if (!ctx->file_table.data.nodes[offset])
 		return -EBADF;
-	io_put_rsrc_node(ctx->file_table.nodes[offset]);
-	ctx->file_table.nodes[offset] = NULL;
+	io_put_rsrc_node(ctx->file_table.data.nodes[offset]);
+	ctx->file_table.data.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
@@ -153,7 +147,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (check_add_overflow(range.off, range.len, &end))
 		return -EOVERFLOW;
-	if (range.resv || end > ctx->nr_user_files)
+	if (range.resv || end > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	io_file_table_set_alloc_range(ctx, range.off, range.len);
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 47616079abaa..664c31502dbb 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -52,7 +52,7 @@ static inline struct file *io_slot_file(struct io_rsrc_node *node)
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	struct io_rsrc_node *node = table->nodes[index];
+	struct io_rsrc_node *node = table->data.nodes[index];
 
 	if (node)
 		return io_slot_file(node);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 60c947114fa3..78df515fb3a7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1879,11 +1879,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
-
-	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+	if (unlikely((unsigned int)fd >= ctx->file_table.data.nr))
 		goto out;
-	fd = array_index_nospec(fd, ctx->nr_user_files);
-	node = ctx->file_table.nodes[fd];
+	fd = array_index_nospec(fd, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[fd];
 	if (node) {
 		io_req_assign_rsrc_node(req, node);
 		req->flags |= io_slot_flags(node);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index edea1ffd501c..b90ab3b8f5e0 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -180,8 +180,8 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl
 	int idx = msg->src_fd;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (likely(idx < ctx->nr_user_files)) {
-		idx = array_index_nospec(idx, ctx->nr_user_files);
+	if (likely(idx < ctx->file_table.data.nr)) {
+		idx = array_index_nospec(idx, ctx->file_table.data.nr);
 		file = io_file_from_index(&ctx->file_table, idx);
 		if (file)
 			get_file(file);
diff --git a/io_uring/net.c b/io_uring/net.c
index ce1156551d10..3e1f31574abb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1347,9 +1347,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (sr->buf_index < ctx->nr_user_bufs) {
-			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			node = ctx->user_bufs[idx];
+		if (sr->buf_index < ctx->buf_table.nr) {
+			idx = array_index_nospec(sr->buf_index, ctx->buf_table.nr);
+			node = ctx->buf_table.nodes[idx];
 			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
diff --git a/io_uring/nop.c b/io_uring/nop.c
index de91600a3bc6..0dac01127de5 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -66,9 +66,9 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (nop->buffer < ctx->nr_user_bufs) {
-			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			node = READ_ONCE(ctx->user_bufs[idx]);
+		if (nop->buffer < ctx->buf_table.nr) {
+			idx = array_index_nospec(nop->buffer, ctx->buf_table.nr);
+			node = READ_ONCE(ctx->buf_table.nodes[idx]);
 			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
diff --git a/io_uring/register.c b/io_uring/register.c
index 1eb686eaa310..45edfc57963a 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -937,7 +937,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
-	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
+				ctx->buf_table.nr, ret);
 	if (!use_registered_ring)
 		fput(file);
 	return ret;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index b1729cbdc749..902e003704a9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -143,39 +143,28 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx,
 	return node;
 }
 
-static void io_rsrc_data_free(struct io_rsrc_data *data)
+__cold void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	int i;
-
-	for (i = 0; i < data->nr; i++) {
-		struct io_rsrc_node *node = data->nodes[i];
-
-		if (node)
-			io_put_rsrc_node(node);
+	if (!data->nr)
+		return;
+	while (data->nr--) {
+		if (data->nodes[data->nr])
+			io_put_rsrc_node(data->nodes[data->nr]);
 	}
 	kvfree(data->nodes);
-	kfree(data);
+	data->nodes = NULL;
+	data->nr = 0;
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
-				     struct io_rsrc_data **pdata)
+__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 {
-	struct io_rsrc_data *data;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
 	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!data->nodes) {
-		kfree(data);
-		return -ENOMEM;
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (data->nodes) {
+		data->nr = nr;
+		return 0;
 	}
-
-	data->nr = nr;
-	*pdata = data;
-	return 0;
+	return -ENOMEM;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -187,9 +176,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	int fd, i, err = 0;
 	unsigned int done;
 
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_files)
+	if (up->offset + nr_args > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -207,10 +196,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		if (fd == IORING_REGISTER_FILES_SKIP)
 			continue;
 
-		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		if (ctx->file_table.nodes[i]) {
-			io_put_rsrc_node(ctx->file_table.nodes[i]);
-			ctx->file_table.nodes[i] = NULL;
+		i = array_index_nospec(up->offset + done, ctx->file_table.data.nr);
+		if (ctx->file_table.data.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.data.nodes[i]);
+			ctx->file_table.data.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
@@ -229,13 +218,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+			node = io_rsrc_node_alloc(ctx, &ctx->file_table.data,
+						  IORING_RSRC_FILE);
 			if (!node) {
 				err = -ENOMEM;
 				fput(file);
 				break;
 			}
-			ctx->file_table.nodes[i] = node;
+			ctx->file_table.data.nodes[i] = node;
 			if (tag)
 				node->tag = tag;
 			io_fixed_file_set(node, file);
@@ -257,9 +247,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 	__u32 done;
 	int i, err;
 
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_bufs)
+	if (up->offset + nr_args > ctx->buf_table.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -283,16 +273,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
+		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
 		node = io_sqe_buffer_register(ctx, iov, i, &last_hpage);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
 			break;
 		}
-		if (ctx->user_bufs[i])
-			io_put_rsrc_node(ctx->user_bufs[i]);
+		if (ctx->buf_table.nodes[i])
+			io_put_rsrc_node(ctx->buf_table.nodes[i]);
 
-		ctx->user_bufs[i] = node;
+		ctx->buf_table.nodes[i] = node;
 		if (tag)
 			node->tag = tag;
 		if (ctx->compat)
@@ -410,7 +400,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 	struct file *file;
 	int ret, fd;
 
-	if (!req->ctx->file_data)
+	if (!req->ctx->file_table.data.nr)
 		return -ENXIO;
 
 	for (done = 0; done < up->nr_args; done++) {
@@ -495,35 +485,13 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 	kfree(node);
 }
 
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-	int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct io_rsrc_node *node = ctx->file_table.nodes[i];
-
-		if (node) {
-			io_put_rsrc_node(node);
-			io_file_bitmap_clear(&ctx->file_table, i);
-			ctx->file_table.nodes[i] = NULL;
-		}
-	}
-
-	io_free_file_tables(&ctx->file_table);
-	io_file_table_set_alloc_range(ctx, 0, 0);
-	io_rsrc_data_free(ctx->file_data);
-	ctx->file_data = NULL;
-	ctx->nr_user_files = 0;
-}
-
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
 
-	__io_sqe_files_unregister(ctx);
+	io_free_file_tables(&ctx->file_table);
+	io_file_table_set_alloc_range(ctx, 0, 0);
 	return 0;
 }
 
@@ -535,7 +503,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	int fd, ret;
 	unsigned i;
 
-	if (ctx->file_data)
+	if (ctx->file_table.data.nr)
 		return -EBUSY;
 	if (!nr_args)
 		return -EINVAL;
@@ -543,17 +511,10 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
-	if (ret)
-		return ret;
-
-	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
-		io_rsrc_data_free(ctx->file_data);
-		ctx->file_data = NULL;
+	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		return -ENOMEM;
-	}
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -583,51 +544,32 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			goto fail;
 		}
 		ret = -ENOMEM;
-		node = io_rsrc_node_alloc(ctx, ctx->file_data, IORING_RSRC_FILE);
+		node = io_rsrc_node_alloc(ctx, &ctx->file_table.data,
+					  IORING_RSRC_FILE);
 		if (!node) {
 			fput(file);
 			goto fail;
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->file_table.nodes[i] = node;
+		ctx->file_table.data.nodes[i] = node;
 		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
 	/* default it to the whole table */
-	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
+	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
 	return 0;
 fail:
-	__io_sqe_files_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	return ret;
 }
 
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
-	unsigned int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		if (ctx->user_bufs[i]) {
-			io_put_rsrc_node(ctx->user_bufs[i]);
-			ctx->user_bufs[i] = NULL;
-		}
-	}
-	kvfree(ctx->user_bufs);
-	ctx->user_bufs = NULL;
-	io_rsrc_data_free(ctx->buf_data);
-	ctx->buf_data = NULL;
-	ctx->nr_user_bufs = 0;
-}
-
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-
-	__io_sqe_buffers_unregister(ctx);
+	io_rsrc_data_free(&ctx->buf_table);
 	return 0;
 }
 
@@ -654,8 +596,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 	}
 
 	/* check previously registered pages */
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_rsrc_node *node = ctx->user_bufs[i];
+	for (i = 0; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
@@ -807,7 +749,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	struct io_imu_folio_data data;
 	bool coalesced;
 
-	node = io_rsrc_node_alloc(ctx, ctx->buf_data, IORING_RSRC_BUFFER);
+	if (!iov->iov_base)
+		return rsrc_empty_node;
+
+	node = io_rsrc_node_alloc(ctx, &ctx->buf_table, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 
@@ -865,40 +810,29 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	return node;
 }
 
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
-	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
-	return ctx->user_bufs ? 0 : -ENOMEM;
-}
-
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
 
-	if (ctx->user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
+	ret = io_rsrc_data_alloc(&data, nr_args);
 	if (ret)
 		return ret;
-	ret = io_buffers_map_alloc(ctx, nr_args);
-	if (ret) {
-		io_rsrc_data_free(data);
-		return ret;
-	}
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -936,14 +870,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->user_bufs[i] = node;
+		data.nodes[i] = node;
 	}
 
-	WARN_ON_ONCE(ctx->buf_data);
-
-	ctx->buf_data = data;
+	ctx->buf_table = data;
 	if (ret)
-		__io_sqe_buffers_unregister(ctx);
+		io_sqe_buffers_unregister(ctx);
 	return ret;
 }
 
@@ -1010,8 +942,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_rsrc_node **user_bufs;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	int i, ret, nbufs;
 
 	/*
@@ -1022,43 +953,35 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 
 	mutex_lock(&src_ctx->uring_lock);
 	ret = -ENXIO;
-	nbufs = src_ctx->nr_user_bufs;
+	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
+	ret = io_rsrc_data_alloc(&data, nbufs);
 	if (ret)
 		goto out_unlock;
 
-	ret = -ENOMEM;
-	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!user_bufs)
-		goto out_free_data;
-
 	for (i = 0; i < nbufs; i++) {
-		struct io_rsrc_node *src_node = src_ctx->user_bufs[i];
+		struct io_rsrc_node *src_node = src_ctx->buf_table.nodes[i];
 		struct io_rsrc_node *dst_node;
 
 		if (src_node == rsrc_empty_node) {
 			dst_node = rsrc_empty_node;
 		} else {
-			dst_node = io_rsrc_node_alloc(ctx, data, IORING_RSRC_BUFFER);
+			dst_node = io_rsrc_node_alloc(ctx, &data, IORING_RSRC_BUFFER);
 			if (!dst_node)
 				goto out_put_free;
 
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		user_bufs[i] = dst_node;
+		data.nodes[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
-	if (!ctx->user_bufs) {
-		ctx->user_bufs = user_bufs;
-		ctx->buf_data = data;
-		ctx->nr_user_bufs = nbufs;
+	if (!ctx->buf_table.nr) {
+		ctx->buf_table = data;
 		return 0;
 	}
 
@@ -1069,12 +992,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	i = nbufs;
 out_put_free:
 	while (i--) {
-		io_buffer_unmap(src_ctx, user_bufs[i]);
-		kfree(user_bufs[i]);
+		io_buffer_unmap(src_ctx, data.nodes[i]);
+		kfree(data.nodes[i]);
 	}
-	kvfree(user_bufs);
-out_free_data:
-	io_rsrc_data_free(data);
+	io_rsrc_data_free(&data);
 out_unlock:
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
@@ -1095,7 +1016,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	struct file *file;
 	int ret;
 
-	if (ctx->user_bufs || ctx->nr_user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (copy_from_user(&buf, arg, sizeof(buf)))
 		return -EFAULT;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 6a7863f13ea9..438e0ac6abf7 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,11 +13,6 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_data {
-	unsigned int			nr;
-	struct io_rsrc_node		**nodes;
-};
-
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
@@ -51,6 +46,8 @@ struct io_imu_folio_data {
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx,
 					struct io_rsrc_data *data, int type);
 void io_free_rsrc_node(struct io_rsrc_node *node);
+void io_rsrc_data_free(struct io_rsrc_data *data);
+int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 65491f4f2c7e..28fff18ebb19 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -339,10 +339,10 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+	if (unlikely(req->buf_index >= ctx->buf_table.nr))
 		return -EFAULT;
-	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	node = ctx->user_bufs[index];
+	index = array_index_nospec(req->buf_index, ctx->buf_table.nr);
+	node = ctx->buf_table.nodes[index];
 	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index f78afb575ae6..aaaddb66e90a 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -66,10 +66,10 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 		return io_file_get_normal(req, sp->splice_fd_in);
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
+	if (unlikely(sp->splice_fd_in >= ctx->file_table.data.nr))
 		goto out;
-	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	node = ctx->file_table.nodes[sp->splice_fd_in];
+	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[sp->splice_fd_in];
 	if (node) {
 		node->refs++;
 		sp->rsrc_node = node;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 0899c71008ae..17d5f5004702 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -212,15 +212,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		u16 index;
 
 		index = READ_ONCE(sqe->buf_index);
-		if (unlikely(index >= ctx->nr_user_bufs))
+		if (unlikely(index >= ctx->buf_table.nr))
 			return -EFAULT;
-		req->buf_index = array_index_nospec(index, ctx->nr_user_bufs);
+		req->buf_index = array_index_nospec(index, ctx->buf_table.nr);
 		/*
 		 * Pi node upfront, prior to io_uring_cmd_import_fixed()
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
+		io_req_assign_rsrc_node(req, ctx->buf_table.nodes[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 11/14] io_uring/rsrc: add io_rsrc_node_lookup() helper
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (9 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 10/14] io_uring/rsrc: unify file and buffer resource tables Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 12/14] io_uring/filetable: remove io_file_from_index() helper Jens Axboe
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

There are lots of spots open-coding this functionality. Add a generic
helper that does the node lookup in a speculation-safe way.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/cancel.c    |  8 +++++---
 io_uring/filetable.c | 16 +++++++++-------
 io_uring/filetable.h |  2 +-
 io_uring/io_uring.c  |  6 +-----
 io_uring/msg_ring.c  | 31 +++++++++++++++----------------
 io_uring/net.c       |  6 ++----
 io_uring/nop.c       |  6 ++----
 io_uring/rsrc.c      | 12 +++++++-----
 io_uring/rsrc.h      |  8 ++++++++
 io_uring/rw.c        |  6 ++----
 io_uring/splice.c    |  6 +-----
 io_uring/uring_cmd.c |  9 ++++-----
 12 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 3a2996307025..bbca5cb69cb5 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -240,10 +240,12 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
 	/* fixed must be grabbed every time since we drop the uring_lock */
 	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
 	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
-		if (unlikely(fd >= ctx->file_table.data.nr))
+		struct io_rsrc_node *node;
+
+		node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
+		if (unlikely(!node))
 			return -EBADF;
-		fd = array_index_nospec(fd, ctx->file_table.data.nr);
-		cd->file = io_file_from_index(&ctx->file_table, fd);
+		cd->file = io_slot_file(node);
 		if (!cd->file)
 			return -EBADF;
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index c1bea2d9dce2..1f22f183cdeb 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -58,7 +58,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 				 u32 slot_index)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_rsrc_node *node;
+	struct io_rsrc_node *node, *old_node;
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
@@ -71,9 +71,9 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (IS_ERR(node))
 		return -ENOMEM;
 
-	slot_index = array_index_nospec(slot_index, ctx->file_table.data.nr);
-	if (ctx->file_table.data.nodes[slot_index])
-		io_put_rsrc_node(ctx->file_table.data.nodes[slot_index]);
+	old_node = io_rsrc_node_lookup(&ctx->file_table.data, slot_index);
+	if (old_node)
+		io_put_rsrc_node(old_node);
 	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
 
@@ -123,15 +123,17 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
+	struct io_rsrc_node *node;
+
 	if (unlikely(!ctx->file_table.data.nr))
 		return -ENXIO;
 	if (offset >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	offset = array_index_nospec(offset, ctx->file_table.data.nr);
-	if (!ctx->file_table.data.nodes[offset])
+	node = io_rsrc_node_lookup(&ctx->file_table.data, offset);
+	if (!node)
 		return -EBADF;
-	io_put_rsrc_node(ctx->file_table.data.nodes[offset]);
+	io_put_rsrc_node(node);
 	ctx->file_table.data.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 664c31502dbb..29edda0caa65 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -52,7 +52,7 @@ static inline struct file *io_slot_file(struct io_rsrc_node *node)
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	struct io_rsrc_node *node = table->data.nodes[index];
+	struct io_rsrc_node *node = io_rsrc_node_lookup(&table->data, index);
 
 	if (node)
 		return io_slot_file(node);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 78df515fb3a7..3a535e9e8ac3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1879,16 +1879,12 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely((unsigned int)fd >= ctx->file_table.data.nr))
-		goto out;
-	fd = array_index_nospec(fd, ctx->file_table.data.nr);
-	node = ctx->file_table.data.nodes[fd];
+	node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
 	if (node) {
 		io_req_assign_rsrc_node(req, node);
 		req->flags |= io_slot_flags(node);
 		file = io_slot_file(node);
 	}
-out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
 }
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b90ab3b8f5e0..99af39e1d0fb 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -172,22 +172,24 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 	return __io_msg_ring_data(target_ctx, msg, issue_flags);
 }
 
-static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+static int io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct file *file = NULL;
-	int idx = msg->src_fd;
+	struct io_rsrc_node *node;
+	int ret = -EBADF;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (likely(idx < ctx->file_table.data.nr)) {
-		idx = array_index_nospec(idx, ctx->file_table.data.nr);
-		file = io_file_from_index(&ctx->file_table, idx);
-		if (file)
-			get_file(file);
+	node = io_rsrc_node_lookup(&ctx->file_table.data, msg->src_fd);
+	if (node) {
+		msg->src_file = io_slot_file(node);
+		if (msg->src_file)
+			get_file(msg->src_file);
+		req->flags |= REQ_F_NEED_CLEANUP;
+		ret = 0;
 	}
 	io_ring_submit_unlock(ctx, issue_flags);
-	return file;
+	return ret;
 }
 
 static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
@@ -256,7 +258,6 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_ring_ctx *target_ctx = req->file->private_data;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct file *src_file = msg->src_file;
 
 	if (msg->len)
 		return -EINVAL;
@@ -264,12 +265,10 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 		return -EINVAL;
 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
 		return -EBADFD;
-	if (!src_file) {
-		src_file = io_msg_grab_file(req, issue_flags);
-		if (!src_file)
-			return -EBADF;
-		msg->src_file = src_file;
-		req->flags |= REQ_F_NEED_CLEANUP;
+	if (!msg->src_file) {
+		int ret = io_msg_grab_file(req, issue_flags);
+		if (unlikely(ret))
+			return ret;
 	}
 
 	if (io_msg_need_remote(target_ctx))
diff --git a/io_uring/net.c b/io_uring/net.c
index 3e1f31574abb..2f7b334ed708 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1343,13 +1343,11 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
 		struct io_ring_ctx *ctx = req->ctx;
 		struct io_rsrc_node *node;
-		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (sr->buf_index < ctx->buf_table.nr) {
-			idx = array_index_nospec(sr->buf_index, ctx->buf_table.nr);
-			node = ctx->buf_table.nodes[idx];
+		node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
+		if (node) {
 			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 0dac01127de5..149dbdc53607 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -62,13 +62,11 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
 		struct io_ring_ctx *ctx = req->ctx;
 		struct io_rsrc_node *node;
-		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (nop->buffer < ctx->buf_table.nr) {
-			idx = array_index_nospec(nop->buffer, ctx->buf_table.nr);
-			node = READ_ONCE(ctx->buf_table.nodes[idx]);
+		node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
+		if (node) {
 			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 902e003704a9..0924c53dd954 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -182,6 +182,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
+		struct io_rsrc_node *node;
 		u64 tag = 0;
 
 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
@@ -196,9 +197,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		if (fd == IORING_REGISTER_FILES_SKIP)
 			continue;
 
-		i = array_index_nospec(up->offset + done, ctx->file_table.data.nr);
-		if (ctx->file_table.data.nodes[i]) {
-			io_put_rsrc_node(ctx->file_table.data.nodes[i]);
+		i = up->offset + done;
+		node = io_rsrc_node_lookup(&ctx->file_table.data, i);
+		if (node) {
+			io_put_rsrc_node(node);
 			ctx->file_table.data.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
@@ -961,9 +963,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		goto out_unlock;
 
 	for (i = 0; i < nbufs; i++) {
-		struct io_rsrc_node *src_node = src_ctx->buf_table.nodes[i];
-		struct io_rsrc_node *dst_node;
+		struct io_rsrc_node *dst_node, *src_node;
 
+		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
 		if (src_node == rsrc_empty_node) {
 			dst_node = rsrc_empty_node;
 		} else {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 438e0ac6abf7..6952fb45f57a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -71,6 +71,14 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 extern const struct io_rsrc_node empty_node;
 #define rsrc_empty_node	(struct io_rsrc_node *) &empty_node
 
+static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
+						       int index)
+{
+	if (index < data->nr)
+		return data->nodes[array_index_nospec(index, data->nr)];
+	return NULL;
+}
+
 static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 {
 	if (node != rsrc_empty_node && !--node->refs)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 28fff18ebb19..30448f343c7f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -332,17 +332,15 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_rsrc_node *node;
 	struct io_async_rw *io;
-	u16 index;
 	int ret;
 
 	ret = io_prep_rw(req, sqe, ddir, false);
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(req->buf_index >= ctx->buf_table.nr))
+	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+	if (!node)
 		return -EFAULT;
-	index = array_index_nospec(req->buf_index, ctx->buf_table.nr);
-	node = ctx->buf_table.nodes[index];
 	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index aaaddb66e90a..deeb8bb18651 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -66,17 +66,13 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 		return io_file_get_normal(req, sp->splice_fd_in);
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely(sp->splice_fd_in >= ctx->file_table.data.nr))
-		goto out;
-	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->file_table.data.nr);
-	node = ctx->file_table.data.nodes[sp->splice_fd_in];
+	node = io_rsrc_node_lookup(&ctx->file_table.data, sp->splice_fd_in);
 	if (node) {
 		node->refs++;
 		sp->rsrc_node = node;
 		file = io_slot_file(node);
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
-out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
 }
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 17d5f5004702..535909a38e76 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -209,18 +209,17 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (ioucmd->flags & IORING_URING_CMD_FIXED) {
 		struct io_ring_ctx *ctx = req->ctx;
-		u16 index;
+		struct io_rsrc_node *node;
 
-		index = READ_ONCE(sqe->buf_index);
-		if (unlikely(index >= ctx->buf_table.nr))
+		node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+		if (unlikely(!node))
 			return -EFAULT;
-		req->buf_index = array_index_nospec(index, ctx->buf_table.nr);
 		/*
 		 * Pi node upfront, prior to io_uring_cmd_import_fixed()
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, ctx->buf_table.nodes[req->buf_index]);
+		io_req_assign_rsrc_node(req, node);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 12/14] io_uring/filetable: remove io_file_from_index() helper
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (10 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 11/14] io_uring/rsrc: add io_rsrc_node_lookup() helper Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 13/14] io_uring/filetable: kill io_reset_alloc_hint() helper Jens Axboe
  2024-10-29 15:16 ` [PATCH 14/14] io_uring/rsrc: add io_reset_rsrc_node() helper Jens Axboe
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

It's only used in fdinfo, nothing really gained from having this helper.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/fdinfo.c    |  4 +++-
 io_uring/filetable.h | 10 ----------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index e3f5e9fe5562..9d96481e2eb6 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -167,8 +167,10 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
 	for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
-		struct file *f = io_file_from_index(&ctx->file_table, i);
+		struct file *f = NULL;
 
+		if (ctx->file_table.data.nodes[i])
+			f = io_slot_file(ctx->file_table.data.nodes[i]);
 		if (f)
 			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
 		else
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 29edda0caa65..6c0c9642f6e9 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -49,16 +49,6 @@ static inline struct file *io_slot_file(struct io_rsrc_node *node)
 	return (struct file *)(node->file_ptr & FFS_MASK);
 }
 
-static inline struct file *io_file_from_index(struct io_file_table *table,
-					      int index)
-{
-	struct io_rsrc_node *node = io_rsrc_node_lookup(&table->data, index);
-
-	if (node)
-		return io_slot_file(node);
-	return NULL;
-}
-
 static inline void io_fixed_file_set(struct io_rsrc_node *node,
 				     struct file *file)
 {
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 13/14] io_uring/filetable: kill io_reset_alloc_hint() helper
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (11 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 12/14] io_uring/filetable: remove io_file_from_index() helper Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  2024-10-29 15:16 ` [PATCH 14/14] io_uring/rsrc: add io_reset_rsrc_node() helper Jens Axboe
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

It's only used internally, and in one spot, just open-code it.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/filetable.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 6c0c9642f6e9..bfacadb8d089 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -56,17 +56,12 @@ static inline void io_fixed_file_set(struct io_rsrc_node *node,
 		(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
 }
 
-static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
-{
-	ctx->file_table.alloc_hint = ctx->file_alloc_start;
-}
-
 static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
 						 unsigned off, unsigned len)
 {
 	ctx->file_alloc_start = off;
 	ctx->file_alloc_end = off + len;
-	io_reset_alloc_hint(ctx);
+	ctx->file_table.alloc_hint = ctx->file_alloc_start;
 }
 
 #endif
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 14/14] io_uring/rsrc: add io_reset_rsrc_node() helper
  2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
                   ` (12 preceding siblings ...)
  2024-10-29 15:16 ` [PATCH 13/14] io_uring/filetable: kill io_reset_alloc_hint() helper Jens Axboe
@ 2024-10-29 15:16 ` Jens Axboe
  13 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-10-29 15:16 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Puts and resets an existing node in a slot, if one exists. Returns true
if a node was there, false if not. This helps clean up some of the code
that does a lookup just to clear an existing node.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/filetable.c | 10 +++-------
 io_uring/rsrc.c      | 12 +++---------
 io_uring/rsrc.h      | 11 +++++++++++
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1f22f183cdeb..717d5b806781 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -58,7 +58,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 				 u32 slot_index)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_rsrc_node *node, *old_node;
+	struct io_rsrc_node *node;
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
@@ -71,10 +71,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (IS_ERR(node))
 		return -ENOMEM;
 
-	old_node = io_rsrc_node_lookup(&ctx->file_table.data, slot_index);
-	if (old_node)
-		io_put_rsrc_node(old_node);
-	else
+	if (!io_reset_rsrc_node(&ctx->file_table.data, slot_index))
 		io_file_bitmap_set(&ctx->file_table, slot_index);
 
 	ctx->file_table.data.nodes[slot_index] = node;
@@ -133,8 +130,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 	node = io_rsrc_node_lookup(&ctx->file_table.data, offset);
 	if (!node)
 		return -EBADF;
-	io_put_rsrc_node(node);
-	ctx->file_table.data.nodes[offset] = NULL;
+	io_reset_rsrc_node(&ctx->file_table.data, offset);
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0924c53dd954..97673771a0fb 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -182,7 +182,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
-		struct io_rsrc_node *node;
 		u64 tag = 0;
 
 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
@@ -198,12 +197,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = up->offset + done;
-		node = io_rsrc_node_lookup(&ctx->file_table.data, i);
-		if (node) {
-			io_put_rsrc_node(node);
-			ctx->file_table.data.nodes[i] = NULL;
+		if (io_reset_rsrc_node(&ctx->file_table.data, i))
 			io_file_bitmap_clear(&ctx->file_table, i);
-		}
+
 		if (fd != -1) {
 			struct file *file = fget(fd);
 			struct io_rsrc_node *node;
@@ -281,9 +277,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = PTR_ERR(node);
 			break;
 		}
-		if (ctx->buf_table.nodes[i])
-			io_put_rsrc_node(ctx->buf_table.nodes[i]);
-
+		io_reset_rsrc_node(&ctx->buf_table, i);
 		ctx->buf_table.nodes[i] = node;
 		if (tag)
 			node->tag = tag;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 6952fb45f57a..abd214f303f5 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -85,6 +85,17 @@ static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 		io_free_rsrc_node(node);
 }
 
+static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
+{
+	struct io_rsrc_node *node = data->nodes[index];
+
+	if (!node)
+		return false;
+	io_put_rsrc_node(node);
+	data->nodes[index] = NULL;
+	return true;
+}
+
 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
 	if (req->rsrc_nodes[IORING_RSRC_FILE] != rsrc_empty_node) {
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-10-29 15:16 ` [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache Jens Axboe
@ 2024-11-19 15:36   ` Guenter Roeck
  2024-11-19 16:02     ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Guenter Roeck @ 2024-11-19 15:36 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring

Hi,

On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> Doesn't matter right now as there's still some bytes left for it, but
> let's prepare for the io_kiocb potentially growing and add a specific
> freeptr offset for it.
> 
> Signed-off-by: Jens Axboe <[email protected]>

This patch triggers:

Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
Stack from 00c63e5c:
        00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
        004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
        00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
        004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
        00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
        00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
Call Trace: [<004b9044>] dump_stack+0xc/0x10
 [<004ae21e>] panic+0xc4/0x252
 [<000c6974>] __kmem_cache_create_args+0x216/0x26c
 [<004a72c2>] strcpy+0x0/0x1c
 [<0002cb62>] parse_args+0x0/0x1f2
 [<000c675e>] __kmem_cache_create_args+0x0/0x26c
 [<004adb58>] memset+0x0/0x8c
 [<0076f28a>] io_uring_init+0x4c/0xca
 [<0076f23e>] io_uring_init+0x0/0xca
 [<000020e0>] do_one_initcall+0x32/0x192
 [<0076f23e>] io_uring_init+0x0/0xca
 [<0000211c>] do_one_initcall+0x6e/0x192
 [<004a72c2>] strcpy+0x0/0x1c
 [<0002cb62>] parse_args+0x0/0x1f2
 [<000020ae>] do_one_initcall+0x0/0x192
 [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
 [<0076f23e>] io_uring_init+0x0/0xca
 [<004b911a>] kernel_init+0x0/0xec
 [<004b912e>] kernel_init+0x14/0xec
 [<004b911a>] kernel_init+0x0/0xec
 [<0000252c>] ret_from_kernel_thread+0xc/0x14

when trying to boot the m68k:q800 machine in qemu.

An added debug message in create_cache() shows the reason:

#### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4

freeptr_offset would need to be 4-byte aligned but that is not the case on m68k.

Bisect log attached.

Guenter

---
# bad: [158f238aa69d91ad74e535c73f552bd4b025109c] Merge tag 'for-linus-6.13-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
# good: [adc218676eef25575469234709c2d87185ca223a] Linux 6.12
git bisect start '158f238aa69d' 'v6.12'
# good: [77a0cfafa9af9c0d5b43534eb90d530c189edca1] Merge tag 'for-6.13/block-20241118' of git://git.kernel.dk/linux
git bisect good 77a0cfafa9af9c0d5b43534eb90d530c189edca1
# bad: [0338cd9c22d1bce7dc4a6641d4215a50f476f429] Merge tag 's390-6.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
git bisect bad 0338cd9c22d1bce7dc4a6641d4215a50f476f429
# good: [fbe057e874c7037982dea38235e8b9a9be05a8d5] s390/cpu_mf: Convert to use flag output macros
git bisect good fbe057e874c7037982dea38235e8b9a9be05a8d5
# bad: [2f3cc8e441c9f657ff036c56baaab7dddbd0b350] io_uring/napi: protect concurrent io_napi_entry timeout accesses
git bisect bad 2f3cc8e441c9f657ff036c56baaab7dddbd0b350
# good: [d090bffab609762af06dec295a305ce270941b42] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings
git bisect good d090bffab609762af06dec295a305ce270941b42
# bad: [3597f2786b687a7f26361ce00a805ea0af41b65f] io_uring/rsrc: unify file and buffer resource tables
git bisect bad 3597f2786b687a7f26361ce00a805ea0af41b65f
# good: [ff1256b8f3c45f222bce19fbfc1e1bc498b31d03] io_uring/rsrc: move struct io_fixed_file to rsrc.h header
git bisect good ff1256b8f3c45f222bce19fbfc1e1bc498b31d03
# bad: [7029acd8a950393ee3a3d8e1a7ee1a9b77808a3b] io_uring/rsrc: get rid of per-ring io_rsrc_node list
git bisect bad 7029acd8a950393ee3a3d8e1a7ee1a9b77808a3b
# bad: [743fb58a35cde8fe27b07ee5a985ae76563845e3] io_uring/splice: open code 2nd direct file assignment
git bisect bad 743fb58a35cde8fe27b07ee5a985ae76563845e3
# bad: [aaa736b186239b7dc7778ae94c75f26c96972796] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
git bisect bad aaa736b186239b7dc7778ae94c75f26c96972796
# first bad commit: [aaa736b186239b7dc7778ae94c75f26c96972796] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 15:36   ` Guenter Roeck
@ 2024-11-19 16:02     ` Jens Axboe
  2024-11-19 16:21       ` Guenter Roeck
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 16:02 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: io-uring

On 11/19/24 8:36 AM, Guenter Roeck wrote:
> Hi,
> 
> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>> Doesn't matter right now as there's still some bytes left for it, but
>> let's prepare for the io_kiocb potentially growing and add a specific
>> freeptr offset for it.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
> 
> This patch triggers:
> 
> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> Stack from 00c63e5c:
>         00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>         004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>         00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>         004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>         00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>         00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>  [<004ae21e>] panic+0xc4/0x252
>  [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>  [<004a72c2>] strcpy+0x0/0x1c
>  [<0002cb62>] parse_args+0x0/0x1f2
>  [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>  [<004adb58>] memset+0x0/0x8c
>  [<0076f28a>] io_uring_init+0x4c/0xca
>  [<0076f23e>] io_uring_init+0x0/0xca
>  [<000020e0>] do_one_initcall+0x32/0x192
>  [<0076f23e>] io_uring_init+0x0/0xca
>  [<0000211c>] do_one_initcall+0x6e/0x192
>  [<004a72c2>] strcpy+0x0/0x1c
>  [<0002cb62>] parse_args+0x0/0x1f2
>  [<000020ae>] do_one_initcall+0x0/0x192
>  [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>  [<0076f23e>] io_uring_init+0x0/0xca
>  [<004b911a>] kernel_init+0x0/0xec
>  [<004b912e>] kernel_init+0x14/0xec
>  [<004b911a>] kernel_init+0x0/0xec
>  [<0000252c>] ret_from_kernel_thread+0xc/0x14
> 
> when trying to boot the m68k:q800 machine in qemu.
> 
> An added debug message in create_cache() shows the reason:
> 
> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> 
> freeptr_offset would need to be 4-byte aligned but that is not the
> case on m68k.

Why is ->work 2-byte aligned to begin with on m68k?!

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 16:02     ` Jens Axboe
@ 2024-11-19 16:21       ` Guenter Roeck
  2024-11-19 17:49         ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Guenter Roeck @ 2024-11-19 16:21 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-m68k, Geert Uytterhoeven

On 11/19/24 08:02, Jens Axboe wrote:
> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>> Hi,
>>
>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>> Doesn't matter right now as there's still some bytes left for it, but
>>> let's prepare for the io_kiocb potentially growing and add a specific
>>> freeptr offset for it.
>>>
>>> Signed-off-by: Jens Axboe <[email protected]>
>>
>> This patch triggers:
>>
>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>> Stack from 00c63e5c:
>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>   [<004ae21e>] panic+0xc4/0x252
>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>   [<004a72c2>] strcpy+0x0/0x1c
>>   [<0002cb62>] parse_args+0x0/0x1f2
>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>   [<004adb58>] memset+0x0/0x8c
>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>   [<0076f23e>] io_uring_init+0x0/0xca
>>   [<000020e0>] do_one_initcall+0x32/0x192
>>   [<0076f23e>] io_uring_init+0x0/0xca
>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>   [<004a72c2>] strcpy+0x0/0x1c
>>   [<0002cb62>] parse_args+0x0/0x1f2
>>   [<000020ae>] do_one_initcall+0x0/0x192
>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>   [<0076f23e>] io_uring_init+0x0/0xca
>>   [<004b911a>] kernel_init+0x0/0xec
>>   [<004b912e>] kernel_init+0x14/0xec
>>   [<004b911a>] kernel_init+0x0/0xec
>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>
>> when trying to boot the m68k:q800 machine in qemu.
>>
>> An added debug message in create_cache() shows the reason:
>>
>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>
>> freeptr_offset would need to be 4-byte aligned but that is not the
>> case on m68k.
> 
> Why is ->work 2-byte aligned to begin with on m68k?!
> 

My understanding is that m68k does not align pointers.

Copying Geert and the m68k mailing list for feedback. Sorry, I should have done
that earlier.

Guenter


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 16:21       ` Guenter Roeck
@ 2024-11-19 17:49         ` Geert Uytterhoeven
  2024-11-19 19:00           ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-19 17:49 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: Jens Axboe, io-uring, linux-m68k

On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> On 11/19/24 08:02, Jens Axboe wrote:
> > On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >>> Doesn't matter right now as there's still some bytes left for it, but
> >>> let's prepare for the io_kiocb potentially growing and add a specific
> >>> freeptr offset for it.
> >>>
> >>> Signed-off-by: Jens Axboe <[email protected]>
> >>
> >> This patch triggers:
> >>
> >> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >> Stack from 00c63e5c:
> >>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >>   [<004ae21e>] panic+0xc4/0x252
> >>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >>   [<004a72c2>] strcpy+0x0/0x1c
> >>   [<0002cb62>] parse_args+0x0/0x1f2
> >>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >>   [<004adb58>] memset+0x0/0x8c
> >>   [<0076f28a>] io_uring_init+0x4c/0xca
> >>   [<0076f23e>] io_uring_init+0x0/0xca
> >>   [<000020e0>] do_one_initcall+0x32/0x192
> >>   [<0076f23e>] io_uring_init+0x0/0xca
> >>   [<0000211c>] do_one_initcall+0x6e/0x192
> >>   [<004a72c2>] strcpy+0x0/0x1c
> >>   [<0002cb62>] parse_args+0x0/0x1f2
> >>   [<000020ae>] do_one_initcall+0x0/0x192
> >>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >>   [<0076f23e>] io_uring_init+0x0/0xca
> >>   [<004b911a>] kernel_init+0x0/0xec
> >>   [<004b912e>] kernel_init+0x14/0xec
> >>   [<004b911a>] kernel_init+0x0/0xec
> >>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >>
> >> when trying to boot the m68k:q800 machine in qemu.
> >>
> >> An added debug message in create_cache() shows the reason:
> >>
> >> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >>
> >> freeptr_offset would need to be 4-byte aligned but that is not the
> >> case on m68k.
> >
> > Why is ->work 2-byte aligned to begin with on m68k?!
>
> My understanding is that m68k does not align pointers.

The minimum alignment for multi-byte integral values on m68k is
2 bytes.

See also the comment at
https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 17:49         ` Geert Uytterhoeven
@ 2024-11-19 19:00           ` Jens Axboe
  2024-11-19 19:02             ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 19:00 UTC (permalink / raw)
  To: Geert Uytterhoeven, Guenter Roeck; +Cc: io-uring, linux-m68k

On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>> On 11/19/24 08:02, Jens Axboe wrote:
>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>> freeptr offset for it.
>>>>>
>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>
>>>> This patch triggers:
>>>>
>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>> Stack from 00c63e5c:
>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>   [<004ae21e>] panic+0xc4/0x252
>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>   [<004adb58>] memset+0x0/0x8c
>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>   [<000020e0>] do_one_initcall+0x32/0x192
>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>   [<000020ae>] do_one_initcall+0x0/0x192
>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>   [<004b912e>] kernel_init+0x14/0xec
>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>
>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>
>>>> An added debug message in create_cache() shows the reason:
>>>>
>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>
>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>> case on m68k.
>>>
>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>
>> My understanding is that m68k does not align pointers.
> 
> The minimum alignment for multi-byte integral values on m68k is
> 2 bytes.
> 
> See also the comment at
> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46

Maybe it's time we put m68k to bed? :-)

We can force ->work to be 4-byte aligned, which won't change anything
on any remotely current architecture. But it does feel pretty hacky to
need to align based on some ancient thing.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:00           ` Jens Axboe
@ 2024-11-19 19:02             ` Geert Uytterhoeven
  2024-11-19 19:10               ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-19 19:02 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Guenter Roeck, io-uring, linux-m68k

Hi Jens.

On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> > On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> >> On 11/19/24 08:02, Jens Axboe wrote:
> >>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >>>>> Doesn't matter right now as there's still some bytes left for it, but
> >>>>> let's prepare for the io_kiocb potentially growing and add a specific
> >>>>> freeptr offset for it.
> >>>>>
> >>>>> Signed-off-by: Jens Axboe <[email protected]>
> >>>>
> >>>> This patch triggers:
> >>>>
> >>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >>>> Stack from 00c63e5c:
> >>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >>>>   [<004ae21e>] panic+0xc4/0x252
> >>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >>>>   [<004adb58>] memset+0x0/0x8c
> >>>>   [<0076f28a>] io_uring_init+0x4c/0xca
> >>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>   [<000020e0>] do_one_initcall+0x32/0x192
> >>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>   [<0000211c>] do_one_initcall+0x6e/0x192
> >>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>   [<000020ae>] do_one_initcall+0x0/0x192
> >>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>   [<004b912e>] kernel_init+0x14/0xec
> >>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >>>>
> >>>> when trying to boot the m68k:q800 machine in qemu.
> >>>>
> >>>> An added debug message in create_cache() shows the reason:
> >>>>
> >>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >>>>
> >>>> freeptr_offset would need to be 4-byte aligned but that is not the
> >>>> case on m68k.
> >>>
> >>> Why is ->work 2-byte aligned to begin with on m68k?!
> >>
> >> My understanding is that m68k does not align pointers.
> >
> > The minimum alignment for multi-byte integral values on m68k is
> > 2 bytes.
> >
> > See also the comment at
> > https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>
> Maybe it's time we put m68k to bed? :-)
>
> We can add a forced alignment ->work to be 4 bytes, won't change
> anything on anything remotely current. But does feel pretty hacky to
> need to align based on some ancient thing.

Why does freeptr_offset need to be 4-byte aligned?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:02             ` Geert Uytterhoeven
@ 2024-11-19 19:10               ` Jens Axboe
  2024-11-19 19:25                 ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 19:10 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: Guenter Roeck, io-uring, linux-m68k

On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
> Hi Jens.
> 
> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>> freeptr offset for it.
>>>>>>>
>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>
>>>>>> This patch triggers:
>>>>>>
>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>> Stack from 00c63e5c:
>>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>   [<004ae21e>] panic+0xc4/0x252
>>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>   [<004adb58>] memset+0x0/0x8c
>>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>   [<004b912e>] kernel_init+0x14/0xec
>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>
>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>
>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>
>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>
>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>> case on m68k.
>>>>>
>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>
>>>> My understanding is that m68k does not align pointers.
>>>
>>> The minimum alignment for multi-byte integral values on m68k is
>>> 2 bytes.
>>>
>>> See also the comment at
>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>
>> Maybe it's time we put m68k to bed? :-)
>>
>> We can add a forced alignment ->work to be 4 bytes, won't change
>> anything on anything remotely current. But does feel pretty hacky to
>> need to align based on some ancient thing.
> 
> Why does freeptr_offset need to be 4-byte aligned?

Didn't check, but it's slab/slub complaining about using a 2-byte aligned
address for the free pointer offset. It's explicitly checking:

	/* If a custom freelist pointer is requested make sure it's sane. */
	err = -EINVAL;
	if (args->use_freeptr_offset &&
	    (args->freeptr_offset >= object_size ||
	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
	     !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
		goto out;

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:10               ` Jens Axboe
@ 2024-11-19 19:25                 ` Geert Uytterhoeven
  2024-11-19 19:30                   ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-19 19:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Guenter Roeck, io-uring, linux-m68k

Hi Jens,

On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
> > On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
> >> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> >>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> >>>> On 11/19/24 08:02, Jens Axboe wrote:
> >>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >>>>>>> Doesn't matter right now as there's still some bytes left for it, but
> >>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
> >>>>>>> freeptr offset for it.
> >>>>>>>
> >>>>>>> Signed-off-by: Jens Axboe <[email protected]>
> >>>>>>
> >>>>>> This patch triggers:
> >>>>>>
> >>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >>>>>> Stack from 00c63e5c:
> >>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >>>>>>   [<004ae21e>] panic+0xc4/0x252
> >>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >>>>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >>>>>>   [<004adb58>] memset+0x0/0x8c
> >>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
> >>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
> >>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
> >>>>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
> >>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>>>   [<004b912e>] kernel_init+0x14/0xec
> >>>>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >>>>>>
> >>>>>> when trying to boot the m68k:q800 machine in qemu.
> >>>>>>
> >>>>>> An added debug message in create_cache() shows the reason:
> >>>>>>
> >>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >>>>>>
> >>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
> >>>>>> case on m68k.
> >>>>>
> >>>>> Why is ->work 2-byte aligned to begin with on m68k?!
> >>>>
> >>>> My understanding is that m68k does not align pointers.
> >>>
> >>> The minimum alignment for multi-byte integral values on m68k is
> >>> 2 bytes.
> >>>
> >>> See also the comment at
> >>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
> >>
> >> Maybe it's time we put m68k to bed? :-)
> >>
> >> We can add a forced alignment ->work to be 4 bytes, won't change
> >> anything on anything remotely current. But does feel pretty hacky to
> >> need to align based on some ancient thing.
> >
> > Why does freeptr_offset need to be 4-byte aligned?
>
> Didn't check, but it's slab/slub complaining using a 2-byte aligned
> address for the free pointer offset. It's explicitly checking:
>
>         /* If a custom freelist pointer is requested make sure it's sane. */
>         err = -EINVAL;
>         if (args->use_freeptr_offset &&
>             (args->freeptr_offset >= object_size ||
>              !(flags & SLAB_TYPESAFE_BY_RCU) ||
>              !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>                 goto out;

It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
(freeptr_t is sort of a long). If freeptr_offset must be a multiple of
4 or 8 bytes, the code that assigns it must make sure that is true.

I guess this is the code in fs/file_table.c:

    .freeptr_offset = offsetof(struct file, f_freeptr),

which references:

    include/linux/fs.h:           freeptr_t               f_freeptr;

I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
(or __aligned(sizeof(long))) to the definition of freeptr_t:

    include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:25                 ` Geert Uytterhoeven
@ 2024-11-19 19:30                   ` Jens Axboe
  2024-11-19 19:41                     ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 19:30 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: Guenter Roeck, io-uring, linux-m68k

On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
> Hi Jens,
> 
> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>> freeptr offset for it.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>
>>>>>>>> This patch triggers:
>>>>>>>>
>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>> Stack from 00c63e5c:
>>>>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>   [<004ae21e>] panic+0xc4/0x252
>>>>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>   [<004adb58>] memset+0x0/0x8c
>>>>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>   [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>
>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>
>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>
>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>
>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>> case on m68k.
>>>>>>>
>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>
>>>>>> My understanding is that m68k does not align pointers.
>>>>>
>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>> 2 bytes.
>>>>>
>>>>> See also the comment at
>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>
>>>> Maybe it's time we put m68k to bed? :-)
>>>>
>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>> anything on anything remotely current. But does feel pretty hacky to
>>>> need to align based on some ancient thing.
>>>
>>> Why does freeptr_offset need to be 4-byte aligned?
>>
>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>> address for the free pointer offset. It's explicitly checking:
>>
>>         /* If a custom freelist pointer is requested make sure it's sane. */
>>         err = -EINVAL;
>>         if (args->use_freeptr_offset &&
>>             (args->freeptr_offset >= object_size ||
>>              !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>              !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>                 goto out;
> 
> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
> 4 or 8 bytes,
> the code that assigns it must make sure that is true.

Right, this is what the email is about...

> I guess this is the code in fs/file_table.c:
> 
>     .freeptr_offset = offsetof(struct file, f_freeptr),
> 
> which references:
> 
>     include/linux/fs.h:           freeptr_t               f_freeptr;
> 
> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
> (or __aligned(sizeof(long)) to the definition of freeptr_t:
> 
>     include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;

It's not, it's struct io_kiocb->work, as per the stack trace in this
email.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:30                   ` Jens Axboe
@ 2024-11-19 19:41                     ` Geert Uytterhoeven
  2024-11-19 19:44                       ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-19 19:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Guenter Roeck, io-uring, linux-m68k

Hi Jens,

On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
> > On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
> >> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
> >>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
> >>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> >>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> >>>>>> On 11/19/24 08:02, Jens Axboe wrote:
> >>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
> >>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
> >>>>>>>>> freeptr offset for it.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
> >>>>>>>>
> >>>>>>>> This patch triggers:
> >>>>>>>>
> >>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >>>>>>>> Stack from 00c63e5c:
> >>>>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >>>>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >>>>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >>>>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >>>>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >>>>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >>>>>>>>   [<004ae21e>] panic+0xc4/0x252
> >>>>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >>>>>>>>   [<004adb58>] memset+0x0/0x8c
> >>>>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
> >>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
> >>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
> >>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
> >>>>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>>>>>   [<004b912e>] kernel_init+0x14/0xec
> >>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
> >>>>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >>>>>>>>
> >>>>>>>> when trying to boot the m68k:q800 machine in qemu.
> >>>>>>>>
> >>>>>>>> An added debug message in create_cache() shows the reason:
> >>>>>>>>
> >>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >>>>>>>>
> >>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
> >>>>>>>> case on m68k.
> >>>>>>>
> >>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
> >>>>>>
> >>>>>> My understanding is that m68k does not align pointers.
> >>>>>
> >>>>> The minimum alignment for multi-byte integral values on m68k is
> >>>>> 2 bytes.
> >>>>>
> >>>>> See also the comment at
> >>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
> >>>>
> >>>> Maybe it's time we put m68k to bed? :-)
> >>>>
> >>>> We can add a forced alignment ->work to be 4 bytes, won't change
> >>>> anything on anything remotely current. But does feel pretty hacky to
> >>>> need to align based on some ancient thing.
> >>>
> >>> Why does freeptr_offset need to be 4-byte aligned?
> >>
> >> Didn't check, but it's slab/slub complaining using a 2-byte aligned
> >> address for the free pointer offset. It's explicitly checking:
> >>
> >>         /* If a custom freelist pointer is requested make sure it's sane. */
> >>         err = -EINVAL;
> >>         if (args->use_freeptr_offset &&
> >>             (args->freeptr_offset >= object_size ||
> >>              !(flags & SLAB_TYPESAFE_BY_RCU) ||
> >>              !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
> >>                 goto out;
> >
> > It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
> > (free_ptr is sort of a long). If freeptr_offset must be a multiple of
> > 4 or 8 bytes,
> > the code that assigns it must make sure that is true.
>
> Right, this is what the email is about...
>
> > I guess this is the code in fs/file_table.c:
> >
> >     .freeptr_offset = offsetof(struct file, f_freeptr),
> >
> > which references:
> >
> >     include/linux/fs.h:           freeptr_t               f_freeptr;
> >
> > I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
> > (or __aligned(sizeof(long)) to the definition of freeptr_t:
> >
> >     include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>
> It's not, it's struct io_kiocb->work, as per the stack trace in this
> email.

Sorry, I was falling out of thin air into this thread...

linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
offsetof(struct io_kiocb, work),
linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,

Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
Isn't that a bit error-prone, as the slab core code expects a freeptr_t?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:41                     ` Geert Uytterhoeven
@ 2024-11-19 19:44                       ` Jens Axboe
  2024-11-19 19:49                         ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 19:44 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: Guenter Roeck, io-uring, linux-m68k

On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
> Hi Jens,
> 
> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>
>>>>>>>>>> This patch triggers:
>>>>>>>>>>
>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>   [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>   [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>   [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>
>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>
>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>
>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>
>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>> case on m68k.
>>>>>>>>>
>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>
>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>
>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>> 2 bytes.
>>>>>>>
>>>>>>> See also the comment at
>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>
>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>
>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>> need to align based on some ancient thing.
>>>>>
>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>
>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>> address for the free pointer offset. It's explicitly checking:
>>>>
>>>>         /* If a custom freelist pointer is requested make sure it's sane. */
>>>>         err = -EINVAL;
>>>>         if (args->use_freeptr_offset &&
>>>>             (args->freeptr_offset >= object_size ||
>>>>              !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>              !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>                 goto out;
>>>
>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>> 4 or 8 bytes,
>>> the code that assigns it must make sure that is true.
>>
>> Right, this is what the email is about...
>>
>>> I guess this is the code in fs/file_table.c:
>>>
>>>     .freeptr_offset = offsetof(struct file, f_freeptr),
>>>
>>> which references:
>>>
>>>     include/linux/fs.h:           freeptr_t               f_freeptr;
>>>
>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>
>>>     include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>
>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>> email.
> 
> Sorry, I was falling out of thin air into this thread...
> 
> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
> offsetof(struct io_kiocb, work),
> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
> 
> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?

It just needs the space, should not matter otherwise. But may as well
just add the union and align the freeptr so it stops complaining on m68k.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:44                       ` Jens Axboe
@ 2024-11-19 19:49                         ` Jens Axboe
  2024-11-19 21:46                           ` Guenter Roeck
  0 siblings, 1 reply; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 19:49 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: Guenter Roeck, io-uring, linux-m68k

On 11/19/24 12:44 PM, Jens Axboe wrote:
> On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
>> Hi Jens,
>>
>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>>
>>>>>>>>>>> This patch triggers:
>>>>>>>>>>>
>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>>          00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>>          004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>>          00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>>          004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>>          00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>>          00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>>   [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>>   [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>   [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>>   [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>>   [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>   [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>   [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>>   [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>   [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>   [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>>   [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>>   [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>   [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>>   [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>   [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>>
>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>>
>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>>
>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>>
>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>>> case on m68k.
>>>>>>>>>>
>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>>
>>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>>
>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>>> 2 bytes.
>>>>>>>>
>>>>>>>> See also the comment at
>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>>
>>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>>
>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>>> need to align based on some ancient thing.
>>>>>>
>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>>
>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>>> address for the free pointer offset. It's explicitly checking:
>>>>>
>>>>>         /* If a custom freelist pointer is requested make sure it's sane. */
>>>>>         err = -EINVAL;
>>>>>         if (args->use_freeptr_offset &&
>>>>>             (args->freeptr_offset >= object_size ||
>>>>>              !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>>              !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>>                 goto out;
>>>>
>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>>> 4 or 8 bytes,
>>>> the code that assigns it must make sure that is true.
>>>
>>> Right, this is what the email is about...
>>>
>>>> I guess this is the code in fs/file_table.c:
>>>>
>>>>     .freeptr_offset = offsetof(struct file, f_freeptr),
>>>>
>>>> which references:
>>>>
>>>>     include/linux/fs.h:           freeptr_t               f_freeptr;
>>>>
>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>>
>>>>     include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>>
>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>>> email.
>>
>> Sorry, I was falling out of thin air into this thread...
>>
>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>> offsetof(struct io_kiocb, work),
>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>>
>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
> 
> It just needs the space, should not matter otherwise. But may as well
> just add the union and align the freeptr so it stops complaining on m68k.

Ala the below, perhaps alignment takes care of itself then?


diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 593c10a02144..a83ec7f7849d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -674,7 +674,11 @@ struct io_kiocb {
 	struct io_kiocb			*link;
 	/* custom credentials, valid IFF REQ_F_CREDS is set */
 	const struct cred		*creds;
-	struct io_wq_work		work;
+
+	union {
+		struct io_wq_work	work;
+		freeptr_t		freeptr;
+	};
 
 	struct {
 		u64			extra1;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 73af59863300..86ac7df2a601 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
 	struct kmem_cache_args kmem_args = {
 		.useroffset = offsetof(struct io_kiocb, cmd.data),
 		.usersize = sizeof_field(struct io_kiocb, cmd.data),
-		.freeptr_offset = offsetof(struct io_kiocb, work),
+		.freeptr_offset = offsetof(struct io_kiocb, freeptr),
 		.use_freeptr_offset = true,
 	};
 

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 19:49                         ` Jens Axboe
@ 2024-11-19 21:46                           ` Guenter Roeck
  2024-11-19 22:30                             ` Jens Axboe
  0 siblings, 1 reply; 36+ messages in thread
From: Guenter Roeck @ 2024-11-19 21:46 UTC (permalink / raw)
  To: Jens Axboe, Geert Uytterhoeven; +Cc: io-uring, linux-m68k

On 11/19/24 11:49, Jens Axboe wrote:
> On 11/19/24 12:44 PM, Jens Axboe wrote:
>> On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
>>> Hi Jens,
>>>
>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>>>
>>>>>>>>>>>> This patch triggers:
>>>>>>>>>>>>
>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>>>           00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>>>           004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>>>           00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>>>           004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>>>           00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>>>           00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>>>    [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>>>    [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>    [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>>>    [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>>>    [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>    [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>    [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>    [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>>>    [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>    [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>    [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>>>
>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>>>
>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>>>
>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>>>
>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>>>> case on m68k.
>>>>>>>>>>>
>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>>>
>>>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>>>
>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>>>> 2 bytes.
>>>>>>>>>
>>>>>>>>> See also the comment at
>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>>>
>>>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>>>
>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>>>> need to align based on some ancient thing.
>>>>>>>
>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>>>
>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>>>> address for the free pointer offset. It's explicitly checking:
>>>>>>
>>>>>>          /* If a custom freelist pointer is requested make sure it's sane. */
>>>>>>          err = -EINVAL;
>>>>>>          if (args->use_freeptr_offset &&
>>>>>>              (args->freeptr_offset >= object_size ||
>>>>>>               !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>>>               !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>>>                  goto out;
>>>>>
>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>>>> 4 or 8 bytes,
>>>>> the code that assigns it must make sure that is true.
>>>>
>>>> Right, this is what the email is about...
>>>>
>>>>> I guess this is the code in fs/file_table.c:
>>>>>
>>>>>      .freeptr_offset = offsetof(struct file, f_freeptr),
>>>>>
>>>>> which references:
>>>>>
>>>>>      include/linux/fs.h:           freeptr_t               f_freeptr;
>>>>>
>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>>>
>>>>>      include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>>>
>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>>>> email.
>>>
>>> Sorry, I was falling out of thin air into this thread...
>>>
>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>>> offsetof(struct io_kiocb, work),
>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>>>
>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
>>
>> It just needs the space, should not matter otherwise. But may as well
>> just add the union and align the freeptr so it stops complaining on m68k.
> 
> Ala the below, perhaps alignment takes care of itself then?
> 

No, that doesn't work (I tried), at least not on its own, because the pointer
is still unaligned on m68k.

Guenter

> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 593c10a02144..a83ec7f7849d 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -674,7 +674,11 @@ struct io_kiocb {
>   	struct io_kiocb			*link;
>   	/* custom credentials, valid IFF REQ_F_CREDS is set */
>   	const struct cred		*creds;
> -	struct io_wq_work		work;
> +
> +	union {
> +		struct io_wq_work	work;
> +		freeptr_t		freeptr;
> +	};
>   
>   	struct {
>   		u64			extra1;
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 73af59863300..86ac7df2a601 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
>   	struct kmem_cache_args kmem_args = {
>   		.useroffset = offsetof(struct io_kiocb, cmd.data),
>   		.usersize = sizeof_field(struct io_kiocb, cmd.data),
> -		.freeptr_offset = offsetof(struct io_kiocb, work),
> +		.freeptr_offset = offsetof(struct io_kiocb, freeptr),
>   		.use_freeptr_offset = true,
>   	};
>   
> 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 21:46                           ` Guenter Roeck
@ 2024-11-19 22:30                             ` Jens Axboe
  2024-11-20  0:08                               ` Guenter Roeck
  2024-11-20  8:19                               ` Geert Uytterhoeven
  0 siblings, 2 replies; 36+ messages in thread
From: Jens Axboe @ 2024-11-19 22:30 UTC (permalink / raw)
  To: Guenter Roeck, Geert Uytterhoeven; +Cc: io-uring, linux-m68k

On 11/19/24 2:46 PM, Guenter Roeck wrote:
> On 11/19/24 11:49, Jens Axboe wrote:
>> On 11/19/24 12:44 PM, Jens Axboe wrote:
>>> On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
>>>> Hi Jens,
>>>>
>>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>>>>
>>>>>>>>>>>>> This patch triggers:
>>>>>>>>>>>>>
>>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>>>>           00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>>>>           004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>>>>           00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>>>>           004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>>>>           00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>>>>           00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>>>>    [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>>>>    [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>    [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>>>>    [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>>>>    [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>    [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>    [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>    [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>>>>    [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>    [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>    [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>>>>
>>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>>>>
>>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>>>>
>>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>>>>
>>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>>>>> case on m68k.
>>>>>>>>>>>>
>>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>>>>
>>>>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>>>>
>>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>>>>> 2 bytes.
>>>>>>>>>>
>>>>>>>>>> See also the comment at
>>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>>>>
>>>>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>>>>
>>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>>>>> need to align based on some ancient thing.
>>>>>>>>
>>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>>>>
>>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>>>>> address for the free pointer offset. It's explicitly checking:
>>>>>>>
>>>>>>>          /* If a custom freelist pointer is requested make sure it's sane. */
>>>>>>>          err = -EINVAL;
>>>>>>>          if (args->use_freeptr_offset &&
>>>>>>>              (args->freeptr_offset >= object_size ||
>>>>>>>               !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>>>>               !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>>>>                  goto out;
>>>>>>
>>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>>>>> 4 or 8 bytes,
>>>>>> the code that assigns it must make sure that is true.
>>>>>
>>>>> Right, this is what the email is about...
>>>>>
>>>>>> I guess this is the code in fs/file_table.c:
>>>>>>
>>>>>>      .freeptr_offset = offsetof(struct file, f_freeptr),
>>>>>>
>>>>>> which references:
>>>>>>
>>>>>>      include/linux/fs.h:           freeptr_t               f_freeptr;
>>>>>>
>>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>>>>
>>>>>>      include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>>>>
>>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>>>>> email.
>>>>
>>>> Sorry, I was falling out of thin air into this thread...
>>>>
>>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>>>> offsetof(struct io_kiocb, work),
>>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>>>>
>>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
>>>
>>> It just needs the space, should not matter otherwise. But may as well
>>> just add the union and align the freeptr so it stops complaining on m68k.
>>
>> Ala the below, perhaps alignment takes care of itself then?
>>
> 
> No, that doesn't work (I tried), at least not on its own, because the pointer
> is still unaligned on m68k.

Yeah we'll likely need to force it. The below should work, I presume?
Feels pretty odd to have to align it to the size of it, when that should
naturally occur... Crusty legacy archs.

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 593c10a02144..8ed9c6923668 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -674,7 +674,11 @@ struct io_kiocb {
 	struct io_kiocb			*link;
 	/* custom credentials, valid IFF REQ_F_CREDS is set */
 	const struct cred		*creds;
-	struct io_wq_work		work;
+
+	union {
+		struct io_wq_work	work;
+		freeptr_t		freeptr __aligned(sizeof(freeptr_t));
+	};
 
 	struct {
 		u64			extra1;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 73af59863300..86ac7df2a601 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
 	struct kmem_cache_args kmem_args = {
 		.useroffset = offsetof(struct io_kiocb, cmd.data),
 		.usersize = sizeof_field(struct io_kiocb, cmd.data),
-		.freeptr_offset = offsetof(struct io_kiocb, work),
+		.freeptr_offset = offsetof(struct io_kiocb, freeptr),
 		.use_freeptr_offset = true,
 	};
 
-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 22:30                             ` Jens Axboe
@ 2024-11-20  0:08                               ` Guenter Roeck
  2024-11-20  1:58                                 ` Jens Axboe
  2024-11-20  8:19                               ` Geert Uytterhoeven
  1 sibling, 1 reply; 36+ messages in thread
From: Guenter Roeck @ 2024-11-20  0:08 UTC (permalink / raw)
  To: Jens Axboe, Geert Uytterhoeven; +Cc: io-uring, linux-m68k

On 11/19/24 14:30, Jens Axboe wrote:
> On 11/19/24 2:46 PM, Guenter Roeck wrote:
>> On 11/19/24 11:49, Jens Axboe wrote:
>>> On 11/19/24 12:44 PM, Jens Axboe wrote:
>>>> On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
>>>>> Hi Jens,
>>>>>
>>>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>>>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This patch triggers:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>>>>>            00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>>>>>            004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>>>>>            00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>>>>>            004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>>>>>            00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>>>>>            00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>>>>>     [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>>>>>     [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>>>>>     [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>>     [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>>     [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>>>>>     [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>>>>>     [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>     [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>     [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>>>>>     [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>>     [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>>     [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>>>>>     [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>     [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>>     [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>>>>>     [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>>     [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>>>>>> case on m68k.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>>>>>
>>>>>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>>>>>
>>>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>>>>>> 2 bytes.
>>>>>>>>>>>
>>>>>>>>>>> See also the comment at
>>>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>>>>>
>>>>>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>>>>>
>>>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>>>>>> need to align based on some ancient thing.
>>>>>>>>>
>>>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>>>>>
>>>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>>>>>> address for the free pointer offset. It's explicitly checking:
>>>>>>>>
>>>>>>>>           /* If a custom freelist pointer is requested make sure it's sane. */
>>>>>>>>           err = -EINVAL;
>>>>>>>>           if (args->use_freeptr_offset &&
>>>>>>>>               (args->freeptr_offset >= object_size ||
>>>>>>>>                !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>>>>>                !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>>>>>                   goto out;
>>>>>>>
>>>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>>>>>> 4 or 8 bytes,
>>>>>>> the code that assigns it must make sure that is true.
>>>>>>
>>>>>> Right, this is what the email is about...
>>>>>>
>>>>>>> I guess this is the code in fs/file_table.c:
>>>>>>>
>>>>>>>       .freeptr_offset = offsetof(struct file, f_freeptr),
>>>>>>>
>>>>>>> which references:
>>>>>>>
>>>>>>>       include/linux/fs.h:           freeptr_t               f_freeptr;
>>>>>>>
>>>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>>>>>
>>>>>>>       include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>>>>>
>>>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>>>>>> email.
>>>>>
>>>>> Sorry, I was falling out of thin air into this thread...
>>>>>
>>>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>>>>> offsetof(struct io_kiocb, work),
>>>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>>>>>
>>>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>>>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
>>>>
>>>> It just needs the space, should not matter otherwise. But may as well
>>>> just add the union and align the freeptr so it stops complaining on m68k.
>>>
>>> Ala the below, perhaps alignment takes care of itself then?
>>>
>>
>> No, that doesn't work (I tried), at least not on its own, because the pointer
>> is still unaligned on m68k.
> 
> Yeah we'll likely need to force it. The below should work, I presume?
> Feels pretty odd to have to align it to the size of it, when that should
> naturally occur... Crusty legacy archs.
> 

Yes, that works. Feel free to add

Tested-by: Guenter Roeck <[email protected]>

to an official patch.

Thanks,
Guenter

> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 593c10a02144..8ed9c6923668 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -674,7 +674,11 @@ struct io_kiocb {
>   	struct io_kiocb			*link;
>   	/* custom credentials, valid IFF REQ_F_CREDS is set */
>   	const struct cred		*creds;
> -	struct io_wq_work		work;
> +
> +	union {
> +		struct io_wq_work	work;
> +		freeptr_t		freeptr __aligned(sizeof(freeptr_t));
> +	};
>   
>   	struct {
>   		u64			extra1;
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 73af59863300..86ac7df2a601 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
>   	struct kmem_cache_args kmem_args = {
>   		.useroffset = offsetof(struct io_kiocb, cmd.data),
>   		.usersize = sizeof_field(struct io_kiocb, cmd.data),
> -		.freeptr_offset = offsetof(struct io_kiocb, work),
> +		.freeptr_offset = offsetof(struct io_kiocb, freeptr),
>   		.use_freeptr_offset = true,
>   	};
>   


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-20  0:08                               ` Guenter Roeck
@ 2024-11-20  1:58                                 ` Jens Axboe
  0 siblings, 0 replies; 36+ messages in thread
From: Jens Axboe @ 2024-11-20  1:58 UTC (permalink / raw)
  To: Guenter Roeck, Geert Uytterhoeven; +Cc: io-uring, linux-m68k

On 11/19/24 5:08 PM, Guenter Roeck wrote:
> On 11/19/24 14:30, Jens Axboe wrote:
>> On 11/19/24 2:46 PM, Guenter Roeck wrote:
>>> On 11/19/24 11:49, Jens Axboe wrote:
>>>> On 11/19/24 12:44 PM, Jens Axboe wrote:
>>>>> On 11/19/24 12:41 PM, Geert Uytterhoeven wrote:
>>>>>> Hi Jens,
>>>>>>
>>>>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>>>>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>>>>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>>>>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>>>>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>>>>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>>>>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>>>>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>>>>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>>>>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>>>>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>>>>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>>>>>>>>>>>>>>>> freeptr offset for it.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> This patch triggers:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>>>>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>>>>>>>>>>>>>>> Stack from 00c63e5c:
>>>>>>>>>>>>>>>            00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>>>>>>>>>>>>>>>            004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>>>>>>>>>>>>>>>            00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>>>>>>>>>>>>>>>            004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>>>>>>>>>>>>>>>            00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>>>>>>>>>>>>>>>            00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>>>>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>>>>>>>>>>>>>>>     [<004ae21e>] panic+0xc4/0x252
>>>>>>>>>>>>>>>     [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>>>>>>>>>>>>>>>     [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>>>     [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>>>     [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>>>>>>>>>>>>>>>     [<004adb58>] memset+0x0/0x8c
>>>>>>>>>>>>>>>     [<0076f28a>] io_uring_init+0x4c/0xca
>>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>>     [<000020e0>] do_one_initcall+0x32/0x192
>>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>>     [<0000211c>] do_one_initcall+0x6e/0x192
>>>>>>>>>>>>>>>     [<004a72c2>] strcpy+0x0/0x1c
>>>>>>>>>>>>>>>     [<0002cb62>] parse_args+0x0/0x1f2
>>>>>>>>>>>>>>>     [<000020ae>] do_one_initcall+0x0/0x192
>>>>>>>>>>>>>>>     [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>>>>>>>>>>>>>>>     [<0076f23e>] io_uring_init+0x0/0xca
>>>>>>>>>>>>>>>     [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>>>     [<004b912e>] kernel_init+0x14/0xec
>>>>>>>>>>>>>>>     [<004b911a>] kernel_init+0x0/0xec
>>>>>>>>>>>>>>>     [<0000252c>] ret_from_kernel_thread+0xc/0x14
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>>>>>>>>>>>>>>> case on m68k.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>>>>>>>>>>>>>
>>>>>>>>>>>>> My understanding is that m68k does not align pointers.
>>>>>>>>>>>>
>>>>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>>>>>>>>>>>> 2 bytes.
>>>>>>>>>>>>
>>>>>>>>>>>> See also the comment at
>>>>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>>>>>>>>>>>
>>>>>>>>>>> Maybe it's time we put m68k to bed? :-)
>>>>>>>>>>>
>>>>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>>>>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>>>>>>>>>>> need to align based on some ancient thing.
>>>>>>>>>>
>>>>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>>>>>>>>>
>>>>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>>>>>>>>> address for the free pointer offset. It's explicitly checking:
>>>>>>>>>
>>>>>>>>>           /* If a custom freelist pointer is requested make sure it's sane. */
>>>>>>>>>           err = -EINVAL;
>>>>>>>>>           if (args->use_freeptr_offset &&
>>>>>>>>>               (args->freeptr_offset >= object_size ||
>>>>>>>>>                !(flags & SLAB_TYPESAFE_BY_RCU) ||
>>>>>>>>>                !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>>>>>>>>>                   goto out;
>>>>>>>>
>>>>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>>>>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>>>>>>>> 4 or 8 bytes,
>>>>>>>> the code that assigns it must make sure that is true.
>>>>>>>
>>>>>>> Right, this is what the email is about...
>>>>>>>
>>>>>>>> I guess this is the code in fs/file_table.c:
>>>>>>>>
>>>>>>>>       .freeptr_offset = offsetof(struct file, f_freeptr),
>>>>>>>>
>>>>>>>> which references:
>>>>>>>>
>>>>>>>>       include/linux/fs.h:           freeptr_t               f_freeptr;
>>>>>>>>
>>>>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>>>>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>>>>>>>>
>>>>>>>>       include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>>>>>>>
>>>>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>>>>>>> email.
>>>>>>
>>>>>> Sorry, I was falling out of thin air into this thread...
>>>>>>
>>>>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>>>>>> offsetof(struct io_kiocb, work),
>>>>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>>>>>>
>>>>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>>>>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
>>>>>
>>>>> It just needs the space, should not matter otherwise. But may as well
>>>>> just add the union and align the freeptr so it stops complaining on m68k.
>>>>
>>>> Ala the below, perhaps alignment takes care of itself then?
>>>>
>>>
>>> No, that doesn't work (I tried), at least not on its own, because the pointer
>>> is still unaligned on m68k.
>>
>> Yeah we'll likely need to force it. The below should work, I presume?
>> Feels pretty odd to have to align it to the size of it, when that should
>> naturally occur... Crusty legacy archs.
>>
> 
> Yes, that works. Feel free to add
> 
> Tested-by: Guenter Roeck <[email protected]>
> 
> to an official patch.

Thanks for testing, will add that and send it out (and queue it up for
later this merge window).

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-19 22:30                             ` Jens Axboe
  2024-11-20  0:08                               ` Guenter Roeck
@ 2024-11-20  8:19                               ` Geert Uytterhoeven
  2024-11-20  8:47                                 ` Vlastimil Babka
  1 sibling, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-20  8:19 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Guenter Roeck, io-uring, linux-m68k, Christian Brauner,
	Vlastimil Babka, Linux MM, Linux Kernel Mailing List

Hi Jens,

CC Christian (who added the check)
CC Vlastimil (who suggested the check)

On Tue, Nov 19, 2024 at 11:30 PM Jens Axboe <[email protected]> wrote:
> On 11/19/24 2:46 PM, Guenter Roeck wrote:
> > On 11/19/24 11:49, Jens Axboe wrote:
> >> On 11/19/24 12:44 PM, Jens Axboe wrote:
> >>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
> >>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
> >>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
> >>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
> >>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
> >>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> >>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> >>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
> >>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
> >>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
> >>>>>>>>>>>>>> freeptr offset for it.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> This patch triggers:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >>>>>>>>>>>>> Stack from 00c63e5c:
> >>>>>>>>>>>>>           00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >>>>>>>>>>>>>           004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >>>>>>>>>>>>>           00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >>>>>>>>>>>>>           004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >>>>>>>>>>>>>           00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >>>>>>>>>>>>>           00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >>>>>>>>>>>>>    [<004ae21e>] panic+0xc4/0x252
> >>>>>>>>>>>>>    [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>>>>>>>>    [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >>>>>>>>>>>>>    [<004adb58>] memset+0x0/0x8c
> >>>>>>>>>>>>>    [<0076f28a>] io_uring_init+0x4c/0xca
> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>>>>>>    [<000020e0>] do_one_initcall+0x32/0x192
> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>>>>>>    [<0000211c>] do_one_initcall+0x6e/0x192
> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
> >>>>>>>>>>>>>    [<000020ae>] do_one_initcall+0x0/0x192
> >>>>>>>>>>>>>    [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
> >>>>>>>>>>>>>    [<004b912e>] kernel_init+0x14/0xec
> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
> >>>>>>>>>>>>>    [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
> >>>>>>>>>>>>> case on m68k.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
> >>>>>>>>>>>
> >>>>>>>>>>> My understanding is that m68k does not align pointers.
> >>>>>>>>>>
> >>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
> >>>>>>>>>> 2 bytes.
> >>>>>>>>>>
> >>>>>>>>>> See also the comment at
> >>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
> >>>>>>>>>
> >>>>>>>>> Maybe it's time we put m68k to bed? :-)
> >>>>>>>>>
> >>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
> >>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
> >>>>>>>>> need to align based on some ancient thing.
> >>>>>>>>
> >>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
> >>>>>>>
> >>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
> >>>>>>> address for the free pointer offset. It's explicitly checking:
> >>>>>>>
> >>>>>>>          /* If a custom freelist pointer is requested make sure it's sane. */
> >>>>>>>          err = -EINVAL;
> >>>>>>>          if (args->use_freeptr_offset &&
> >>>>>>>              (args->freeptr_offset >= object_size ||
> >>>>>>>               !(flags & SLAB_TYPESAFE_BY_RCU) ||
> >>>>>>>               !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
                                                          ^^^^^^

> >>>>>>>                  goto out;
> >>>>>>
> >>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
> >>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
> >>>>>> 4 or 8 bytes,
> >>>>>> the code that assigns it must make sure that is true.
> >>>>>
> >>>>> Right, this is what the email is about...
> >>>>>
> >>>>>> I guess this is the code in fs/file_table.c:
> >>>>>>
> >>>>>>      .freeptr_offset = offsetof(struct file, f_freeptr),
> >>>>>>
> >>>>>> which references:
> >>>>>>
> >>>>>>      include/linux/fs.h:           freeptr_t               f_freeptr;
> >>>>>>
> >>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
> >>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
> >>>>>>
> >>>>>>      include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
> >>>>>
> >>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
> >>>>> email.
> >>>>
> >>>> Sorry, I was falling out of thin air into this thread...
> >>>>
> >>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
> >>>> offsetof(struct io_kiocb, work),
> >>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
> >>>>
> >>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
> >>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
> >>>
> >>> It just needs the space, should not matter otherwise. But may as well
> >>> just add the union and align the freeptr so it stops complaining on m68k.
> >>
> >> Ala the below, perhaps alignment takes care of itself then?
> >
> > No, that doesn't work (I tried), at least not on its own, because the pointer
> > is still unaligned on m68k.
>
> Yeah we'll likely need to force it. The below should work, I presume?
> Feels pretty odd to have to align it to the size of it, when that should
> naturally occur... Crusty legacy archs.
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 593c10a02144..8ed9c6923668 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -674,7 +674,11 @@ struct io_kiocb {
>         struct io_kiocb                 *link;
>         /* custom credentials, valid IFF REQ_F_CREDS is set */
>         const struct cred               *creds;
> -       struct io_wq_work               work;
> +
> +       union {
> +               struct io_wq_work       work;
> +               freeptr_t               freeptr __aligned(sizeof(freeptr_t));

I'd rather add the __aligned() to the definition of freeptr_t, so it
applies to all (future) users.

But my main question stays: why is the slab code checking
IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))?
Perhaps that was just intended to be __alignof__ instead of sizeof()?

> +       };
>
>         struct {
>                 u64                     extra1;
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 73af59863300..86ac7df2a601 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
>         struct kmem_cache_args kmem_args = {
>                 .useroffset = offsetof(struct io_kiocb, cmd.data),
>                 .usersize = sizeof_field(struct io_kiocb, cmd.data),
> -               .freeptr_offset = offsetof(struct io_kiocb, work),
> +               .freeptr_offset = offsetof(struct io_kiocb, freeptr),
>                 .use_freeptr_offset = true,
>         };

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-20  8:19                               ` Geert Uytterhoeven
@ 2024-11-20  8:47                                 ` Vlastimil Babka
  2024-11-20  9:07                                   ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Vlastimil Babka @ 2024-11-20  8:47 UTC (permalink / raw)
  To: Geert Uytterhoeven, Jens Axboe, Jann Horn
  Cc: Guenter Roeck, io-uring, linux-m68k, Christian Brauner, Linux MM,
	Linux Kernel Mailing List

On 11/20/24 09:19, Geert Uytterhoeven wrote:
> Hi Jens,
> 
> CC Christian (who added the check)
> CC Vlastimil (who suggested the check)
> 
> On Tue, Nov 19, 2024 at 11:30 PM Jens Axboe <[email protected]> wrote:
>> On 11/19/24 2:46 PM, Guenter Roeck wrote:
>> > On 11/19/24 11:49, Jens Axboe wrote:
>> >> On 11/19/24 12:44 PM, Jens Axboe wrote:
>> >>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
>> >>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
>> >>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
>> >>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
>> >>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
>> >>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
>> >>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
>> >>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
>> >>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
>> >>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
>> >>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
>> >>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
>> >>>>>>>>>>>>>> freeptr offset for it.
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> This patch triggers:
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
>> >>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
>> >>>>>>>>>>>>> Stack from 00c63e5c:
>> >>>>>>>>>>>>>           00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
>> >>>>>>>>>>>>>           004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
>> >>>>>>>>>>>>>           00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
>> >>>>>>>>>>>>>           004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
>> >>>>>>>>>>>>>           00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
>> >>>>>>>>>>>>>           00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
>> >>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
>> >>>>>>>>>>>>>    [<004ae21e>] panic+0xc4/0x252
>> >>>>>>>>>>>>>    [<000c6974>] __kmem_cache_create_args+0x216/0x26c
>> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>> >>>>>>>>>>>>>    [<000c675e>] __kmem_cache_create_args+0x0/0x26c
>> >>>>>>>>>>>>>    [<004adb58>] memset+0x0/0x8c
>> >>>>>>>>>>>>>    [<0076f28a>] io_uring_init+0x4c/0xca
>> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>> >>>>>>>>>>>>>    [<000020e0>] do_one_initcall+0x32/0x192
>> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>> >>>>>>>>>>>>>    [<0000211c>] do_one_initcall+0x6e/0x192
>> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
>> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
>> >>>>>>>>>>>>>    [<000020ae>] do_one_initcall+0x0/0x192
>> >>>>>>>>>>>>>    [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
>> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
>> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>> >>>>>>>>>>>>>    [<004b912e>] kernel_init+0x14/0xec
>> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
>> >>>>>>>>>>>>>    [<0000252c>] ret_from_kernel_thread+0xc/0x14
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
>> >>>>>>>>>>>>> case on m68k.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
>> >>>>>>>>>>>
>> >>>>>>>>>>> My understanding is that m68k does not align pointers.
>> >>>>>>>>>>
>> >>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
>> >>>>>>>>>> 2 bytes.
>> >>>>>>>>>>
>> >>>>>>>>>> See also the comment at
>> >>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
>> >>>>>>>>>
>> >>>>>>>>> Maybe it's time we put m68k to bed? :-)
>> >>>>>>>>>
>> >>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
>> >>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
>> >>>>>>>>> need to align based on some ancient thing.
>> >>>>>>>>
>> >>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
>> >>>>>>>
>> >>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
>> >>>>>>> address for the free pointer offset. It's explicitly checking:
>> >>>>>>>
>> >>>>>>>          /* If a custom freelist pointer is requested make sure it's sane. */
>> >>>>>>>          err = -EINVAL;
>> >>>>>>>          if (args->use_freeptr_offset &&
>> >>>>>>>              (args->freeptr_offset >= object_size ||
>> >>>>>>>               !(flags & SLAB_TYPESAFE_BY_RCU) ||
>> >>>>>>>               !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
>                                                           ^^^^^^
> 
>> >>>>>>>                  goto out;
>> >>>>>>
>> >>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
>> >>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
>> >>>>>> 4 or 8 bytes,
>> >>>>>> the code that assigns it must make sure that is true.
>> >>>>>
>> >>>>> Right, this is what the email is about...
>> >>>>>
>> >>>>>> I guess this is the code in fs/file_table.c:
>> >>>>>>
>> >>>>>>      .freeptr_offset = offsetof(struct file, f_freeptr),
>> >>>>>>
>> >>>>>> which references:
>> >>>>>>
>> >>>>>>      include/linux/fs.h:           freeptr_t               f_freeptr;
>> >>>>>>
>> >>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
>> >>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
>> >>>>>>
>> >>>>>>      include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
>> >>>>>
>> >>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
>> >>>>> email.
>> >>>>
>> >>>> Sorry, I was falling out of thin air into this thread...
>> >>>>
>> >>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
>> >>>> offsetof(struct io_kiocb, work),
>> >>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
>> >>>>
>> >>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
>> >>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
>> >>>
>> >>> It just needs the space, should not matter otherwise. But may as well
>> >>> just add the union and align the freeptr so it stops complaining on m68k.
>> >>
>> >> Ala the below, perhaps alignment takes care of itself then?
>> >
>> > No, that doesn't work (I tried), at least not on its own, because the pointer
>> > is still unaligned on m68k.
>>
>> Yeah we'll likely need to force it. The below should work, I presume?
>> Feels pretty odd to have to align it to the size of it, when that should
>> naturally occur... Crusty legacy archs.
>>
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index 593c10a02144..8ed9c6923668 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -674,7 +674,11 @@ struct io_kiocb {
>>         struct io_kiocb                 *link;
>>         /* custom credentials, valid IFF REQ_F_CREDS is set */
>>         const struct cred               *creds;
>> -       struct io_wq_work               work;
>> +
>> +       union {
>> +               struct io_wq_work       work;
>> +               freeptr_t               freeptr __aligned(sizeof(freeptr_t));
> 
> I'd rather add the __aligned() to the definition of freeptr_t, so it
> applies to all (future) users.
> 
> But my main question stays: why is the slab code checking
> IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)?

I believe it's to match how SLUB normally calculates the offset if no
explicit one is given, in calculate_sizes():

s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));

Yes, there's a sizeof(void *) because the free pointer used to be just that,
and we forgot to update this place when freeptr_t was introduced (by Jann in
44f6a42d49350) for handling CONFIG_SLAB_FREELIST_HARDENED. In
get_freepointer() you can see how there's a cast to a pointer eventually.

Does m68k have different alignments for pointer and unsigned long, or are both
2 bytes? Or does any other arch, i.e. should freeptr_t be a union of
unsigned long and void * instead? (Or doesn't it matter?)

> Perhaps that was just intended to be __alignof__ instead of sizeof()?

Would it do the right thing everywhere, given the explanation above?

Thanks,
Vlastimil

>> +       };
>>
>>         struct {
>>                 u64                     extra1;
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 73af59863300..86ac7df2a601 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -3812,7 +3812,7 @@ static int __init io_uring_init(void)
>>         struct kmem_cache_args kmem_args = {
>>                 .useroffset = offsetof(struct io_kiocb, cmd.data),
>>                 .usersize = sizeof_field(struct io_kiocb, cmd.data),
>> -               .freeptr_offset = offsetof(struct io_kiocb, work),
>> +               .freeptr_offset = offsetof(struct io_kiocb, freeptr),
>>                 .use_freeptr_offset = true,
>>         };
> 
> Gr{oetje,eeting}s,
> 
>                         Geert
> 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-20  8:47                                 ` Vlastimil Babka
@ 2024-11-20  9:07                                   ` Geert Uytterhoeven
  2024-11-20  9:37                                     ` Vlastimil Babka
  0 siblings, 1 reply; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-20  9:07 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Jens Axboe, Jann Horn, Guenter Roeck, io-uring, linux-m68k,
	Christian Brauner, Linux MM, Linux Kernel Mailing List

Hi Vlastimil,

On Wed, Nov 20, 2024 at 9:47 AM Vlastimil Babka <[email protected]> wrote:
> On 11/20/24 09:19, Geert Uytterhoeven wrote:
> > On Tue, Nov 19, 2024 at 11:30 PM Jens Axboe <[email protected]> wrote:
> >> On 11/19/24 2:46 PM, Guenter Roeck wrote:
> >> > On 11/19/24 11:49, Jens Axboe wrote:
> >> >> On 11/19/24 12:44 PM, Jens Axboe wrote:
> >> >>>> On Tue, Nov 19, 2024 at 8:30 PM Jens Axboe <[email protected]> wrote:
> >> >>>>> On 11/19/24 12:25 PM, Geert Uytterhoeven wrote:
> >> >>>>>> On Tue, Nov 19, 2024 at 8:10 PM Jens Axboe <[email protected]> wrote:
> >> >>>>>>> On 11/19/24 12:02 PM, Geert Uytterhoeven wrote:
> >> >>>>>>>> On Tue, Nov 19, 2024 at 8:00 PM Jens Axboe <[email protected]> wrote:
> >> >>>>>>>>> On 11/19/24 10:49 AM, Geert Uytterhoeven wrote:
> >> >>>>>>>>>> On Tue, Nov 19, 2024 at 5:21 PM Guenter Roeck <[email protected]> wrote:
> >> >>>>>>>>>>> On 11/19/24 08:02, Jens Axboe wrote:
> >> >>>>>>>>>>>> On 11/19/24 8:36 AM, Guenter Roeck wrote:
> >> >>>>>>>>>>>>> On Tue, Oct 29, 2024 at 09:16:32AM -0600, Jens Axboe wrote:
> >> >>>>>>>>>>>>>> Doesn't matter right now as there's still some bytes left for it, but
> >> >>>>>>>>>>>>>> let's prepare for the io_kiocb potentially growing and add a specific
> >> >>>>>>>>>>>>>> freeptr offset for it.
> >> >>>>>>>>>>>>>>
> >> >>>>>>>>>>>>>> Signed-off-by: Jens Axboe <[email protected]>
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> This patch triggers:
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22
> >> >>>>>>>>>>>>> CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-mac-00971-g158f238aa69d #1
> >> >>>>>>>>>>>>> Stack from 00c63e5c:
> >> >>>>>>>>>>>>>           00c63e5c 00612c1c 00612c1c 00000300 00000001 005f3ce6 004b9044 00612c1c
> >> >>>>>>>>>>>>>           004ae21e 00000310 000000b6 005f3ce6 005f3ce6 ffffffea ffffffea 00797244
> >> >>>>>>>>>>>>>           00c63f20 000c6974 005ee588 004c9051 005f3ce6 ffffffea 000000a5 00c614a0
> >> >>>>>>>>>>>>>           004a72c2 0002cb62 000c675e 004adb58 0076f28a 005f3ce6 000000b6 00c63ef4
> >> >>>>>>>>>>>>>           00000310 00c63ef4 00000000 00000016 0076f23e 00c63f4c 00000010 00000004
> >> >>>>>>>>>>>>>           00000038 0000009a 01000000 00000000 00000000 00000000 000020e0 0076f23e
> >> >>>>>>>>>>>>> Call Trace: [<004b9044>] dump_stack+0xc/0x10
> >> >>>>>>>>>>>>>    [<004ae21e>] panic+0xc4/0x252
> >> >>>>>>>>>>>>>    [<000c6974>] __kmem_cache_create_args+0x216/0x26c
> >> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
> >> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
> >> >>>>>>>>>>>>>    [<000c675e>] __kmem_cache_create_args+0x0/0x26c
> >> >>>>>>>>>>>>>    [<004adb58>] memset+0x0/0x8c
> >> >>>>>>>>>>>>>    [<0076f28a>] io_uring_init+0x4c/0xca
> >> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >> >>>>>>>>>>>>>    [<000020e0>] do_one_initcall+0x32/0x192
> >> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >> >>>>>>>>>>>>>    [<0000211c>] do_one_initcall+0x6e/0x192
> >> >>>>>>>>>>>>>    [<004a72c2>] strcpy+0x0/0x1c
> >> >>>>>>>>>>>>>    [<0002cb62>] parse_args+0x0/0x1f2
> >> >>>>>>>>>>>>>    [<000020ae>] do_one_initcall+0x0/0x192
> >> >>>>>>>>>>>>>    [<0075c4e2>] kernel_init_freeable+0x1a0/0x1a4
> >> >>>>>>>>>>>>>    [<0076f23e>] io_uring_init+0x0/0xca
> >> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
> >> >>>>>>>>>>>>>    [<004b912e>] kernel_init+0x14/0xec
> >> >>>>>>>>>>>>>    [<004b911a>] kernel_init+0x0/0xec
> >> >>>>>>>>>>>>>    [<0000252c>] ret_from_kernel_thread+0xc/0x14
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> when trying to boot the m68k:q800 machine in qemu.
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> An added debug message in create_cache() shows the reason:
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> #### freeptr_offset=154 object_size=182 flags=0x310 aligned=0 sizeof(freeptr_t)=4
> >> >>>>>>>>>>>>>
> >> >>>>>>>>>>>>> freeptr_offset would need to be 4-byte aligned but that is not the
> >> >>>>>>>>>>>>> case on m68k.
> >> >>>>>>>>>>>>
> >> >>>>>>>>>>>> Why is ->work 2-byte aligned to begin with on m68k?!
> >> >>>>>>>>>>>
> >> >>>>>>>>>>> My understanding is that m68k does not align pointers.
> >> >>>>>>>>>>
> >> >>>>>>>>>> The minimum alignment for multi-byte integral values on m68k is
> >> >>>>>>>>>> 2 bytes.
> >> >>>>>>>>>>
> >> >>>>>>>>>> See also the comment at
> >> >>>>>>>>>> https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46
> >> >>>>>>>>>
> >> >>>>>>>>> Maybe it's time we put m68k to bed? :-)
> >> >>>>>>>>>
> >> >>>>>>>>> We can add a forced alignment ->work to be 4 bytes, won't change
> >> >>>>>>>>> anything on anything remotely current. But does feel pretty hacky to
> >> >>>>>>>>> need to align based on some ancient thing.
> >> >>>>>>>>
> >> >>>>>>>> Why does freeptr_offset need to be 4-byte aligned?
> >> >>>>>>>
> >> >>>>>>> Didn't check, but it's slab/slub complaining using a 2-byte aligned
> >> >>>>>>> address for the free pointer offset. It's explicitly checking:
> >> >>>>>>>
> >> >>>>>>>          /* If a custom freelist pointer is requested make sure it's sane. */
> >> >>>>>>>          err = -EINVAL;
> >> >>>>>>>          if (args->use_freeptr_offset &&
> >> >>>>>>>              (args->freeptr_offset >= object_size ||
> >> >>>>>>>               !(flags & SLAB_TYPESAFE_BY_RCU) ||
> >> >>>>>>>               !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
> >                                                           ^^^^^^
> >
> >> >>>>>>>                  goto out;
> >> >>>>>>
> >> >>>>>> It is not guaranteed that alignof(freeptr_t) >= sizeof(freeptr_t)
> >> >>>>>> (free_ptr is sort of a long). If freeptr_offset must be a multiple of
> >> >>>>>> 4 or 8 bytes,
> >> >>>>>> the code that assigns it must make sure that is true.
> >> >>>>>
> >> >>>>> Right, this is what the email is about...
> >> >>>>>
> >> >>>>>> I guess this is the code in fs/file_table.c:
> >> >>>>>>
> >> >>>>>>      .freeptr_offset = offsetof(struct file, f_freeptr),
> >> >>>>>>
> >> >>>>>> which references:
> >> >>>>>>
> >> >>>>>>      include/linux/fs.h:           freeptr_t               f_freeptr;
> >> >>>>>>
> >> >>>>>> I guess the simplest solution is to add an __aligned(sizeof(freeptr_t))
> >> >>>>>> (or __aligned(sizeof(long)) to the definition of freeptr_t:
> >> >>>>>>
> >> >>>>>>      include/linux/slab.h:typedef struct { unsigned long v; } freeptr_t;
> >> >>>>>
> >> >>>>> It's not, it's struct io_kiocb->work, as per the stack trace in this
> >> >>>>> email.
> >> >>>>
> >> >>>> Sorry, I was falling out of thin air into this thread...
> >> >>>>
> >> >>>> linux-next/master:io_uring/io_uring.c:          .freeptr_offset =
> >> >>>> offsetof(struct io_kiocb, work),
> >> >>>> linux-next/master:io_uring/io_uring.c:          .use_freeptr_offset = true,
> >> >>>>
> >> >>>> Apparently io_kiocb.work is of type struct io_wq_work, not freeptr_t?
> >> >>>> Isn't that a bit error-prone, as the slab core code expects a freeptr_t?
> >> >>>
> >> >>> It just needs the space, should not matter otherwise. But may as well
> >> >>> just add the union and align the freeptr so it stops complaining on m68k.
> >> >>
> >> >> Ala the below, perhaps alignment takes care of itself then?
> >> >
> >> > No, that doesn't work (I tried), at least not on its own, because the pointer
> >> > is still unaligned on m68k.
> >>
> >> Yeah we'll likely need to force it. The below should work, I presume?
> >> Feels pretty odd to have to align it to the size of it, when that should
> >> naturally occur... Crusty legacy archs.
> >>
> >> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> >> index 593c10a02144..8ed9c6923668 100644
> >> --- a/include/linux/io_uring_types.h
> >> +++ b/include/linux/io_uring_types.h
> >> @@ -674,7 +674,11 @@ struct io_kiocb {
> >>         struct io_kiocb                 *link;
> >>         /* custom credentials, valid IFF REQ_F_CREDS is set */
> >>         const struct cred               *creds;
> >> -       struct io_wq_work               work;
> >> +
> >> +       union {
> >> +               struct io_wq_work       work;
> >> +               freeptr_t               freeptr __aligned(sizeof(freeptr_t));
> >
> > I'd rather add the __aligned() to the definition of freeptr_t, so it
> > applies to all (future) users.
> >
> > But my main question stays: why is the slab code checking
> > IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)?
>
> I believe it's to match how SLUB normally calculates the offset if no
> explicit one is given, in calculate_sizes():
>
> s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
>
> Yes there's a sizeof(void *) because freepointer used to be just that and we
> forgot to update this place when freepointer_t was introduced (by Jann in
> 44f6a42d49350) for handling CONFIG_SLAB_FREELIST_HARDENED. In
> get_freepointer() you can see how there's a cast to a pointer eventually.
>
> Does m68k have different alignment for pointer and unsigned long or both are
> 2 bytes? Or any other arch, i.e. should get_freepointer be a union with
> unsigned long and void * instead? (or it doesn't matter?)

The default alignment for int, long, and pointer is 2 on m68k.
On CRIS (no longer supported by Linux), it was 1, IIRC.
So the union won't make a difference.

> > Perhaps that was just intended to be __alignof__ instead of sizeof()?
>
> Would it do the right thing everywhere, given the explanation above?

It depends. Does anything rely on the offset being a multiple of (at
least) 4?
E.g. does anything count in multiples of longs (hi BCPL! ;-), or are
the 2 LSBs used for a special purpose (cf. maple_tree, which uses
bit 0: https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46)?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-20  9:07                                   ` Geert Uytterhoeven
@ 2024-11-20  9:37                                     ` Vlastimil Babka
  2024-11-20 12:48                                       ` Geert Uytterhoeven
  0 siblings, 1 reply; 36+ messages in thread
From: Vlastimil Babka @ 2024-11-20  9:37 UTC (permalink / raw)
  To: Geert Uytterhoeven, Kees Cook
  Cc: Jens Axboe, Jann Horn, Guenter Roeck, io-uring, linux-m68k,
	Christian Brauner, Linux MM, Linux Kernel Mailing List

On 11/20/24 10:07, Geert Uytterhoeven wrote:
> Hi Vlastimil,
> 
>> >>
>> >> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> >> index 593c10a02144..8ed9c6923668 100644
>> >> --- a/include/linux/io_uring_types.h
>> >> +++ b/include/linux/io_uring_types.h
>> >> @@ -674,7 +674,11 @@ struct io_kiocb {
>> >>         struct io_kiocb                 *link;
>> >>         /* custom credentials, valid IFF REQ_F_CREDS is set */
>> >>         const struct cred               *creds;
>> >> -       struct io_wq_work               work;
>> >> +
>> >> +       union {
>> >> +               struct io_wq_work       work;
>> >> +               freeptr_t               freeptr __aligned(sizeof(freeptr_t));
>> >
>> > I'd rather add the __aligned() to the definition of freeptr_t, so it
>> > applies to all (future) users.
>> >
>> > But my main question stays: why is the slab code checking
>> > IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)?
>>
>> I believe it's to match how SLUB normally calculates the offset if no
>> explicit one is given, in calculate_sizes():
>>
>> s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
>>
>> Yes there's a sizeof(void *) because freepointer used to be just that and we
>> forgot to update this place when freepointer_t was introduced (by Jann in
>> 44f6a42d49350) for handling CONFIG_SLAB_FREELIST_HARDENED. In
>> get_freepointer() you can see how there's a cast to a pointer eventually.
>>
>> Does m68k have different alignment for pointer and unsigned long or both are
>> 2 bytes? Or any other arch, i.e. should get_freepointer be a union with
>> unsigned long and void * instead? (or it doesn't matter?)
> 
> The default alignment for int, long, and pointer is 2 on m68k.
> On CRIS (no longer supported by Linux), it was 1, IIRC.
> So the union won't make a difference.
> 
>> > Perhaps that was just intended to be __alignof__ instead of sizeof()?
>>
>> Would it do the right thing everywhere, given the explanation above?
> 
> It depends. Does anything rely on the offset being a multiple of (at
> least) 4?
> E.g. does anything counts in multiples of longs (hi BCPL! ;-), or are
> the 2 LSB used for a special purpose? (cfr. maple_tree, which uses
> bit 0 (https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46)?

AFAIK no, the goal was just to prevent misaligned accesses. Kees added the:

s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));

so maybe he had something else in mind. But I suspect it was just because
the code already used it elsewhere.

So we might want something like this? But that would be safer for 6.14, so
I'd suggest the io_uring-specific fix in the meantime. Or maybe just add the
union with freeptr_t but without __aligned, plus only the part below that
changes mm/slab_common.c, as the 6.13 io_uring fix?

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 893d32059915..477fa471da18 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -230,7 +230,7 @@ static struct kmem_cache *create_cache(const char *name,
 	if (args->use_freeptr_offset &&
 	    (args->freeptr_offset >= object_size ||
 	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
-	     !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
+	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
 		goto out;
 
 	err = -ENOMEM;
diff --git a/mm/slub.c b/mm/slub.c
index 5b832512044e..6ad904be7700 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5287,11 +5287,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 	unsigned int size = s->object_size;
 	unsigned int order;
 
-	/*
-	 * Round up object size to the next word boundary. We can only
-	 * place the free pointer at word boundaries and this determines
-	 * the possible location of the free pointer.
-	 */
+	/* Round up object size to the next word boundary. */
 	size = ALIGN(size, sizeof(void *));
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -5325,7 +5321,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 	if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
 	    (flags & SLAB_POISON) || s->ctor ||
 	    ((flags & SLAB_RED_ZONE) &&
-	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
+	     (s->object_size < sizeof(freeptr_t) || slub_debug_orig_size(s)))) {
 		/*
 		 * Relocate free pointer after the object if it is not
 		 * permitted to overwrite the first word of the object on
@@ -5343,7 +5339,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 		 * longer true, the function needs to be modified.
 		 */
 		s->offset = size;
-		size += sizeof(void *);
+		size += sizeof(freeptr_t);
 	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
 		s->offset = args->freeptr_offset;
 	} else {
@@ -5352,7 +5348,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 		 * it away from the edges of the object to avoid small
 		 * sized over/underflows from neighboring allocations.
 		 */
-		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
+		s->offset = ALIGN_DOWN(s->object_size / 2, __alignof__(freeptr_t));
 	}
 
 #ifdef CONFIG_SLUB_DEBUG




^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache
  2024-11-20  9:37                                     ` Vlastimil Babka
@ 2024-11-20 12:48                                       ` Geert Uytterhoeven
  0 siblings, 0 replies; 36+ messages in thread
From: Geert Uytterhoeven @ 2024-11-20 12:48 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Kees Cook, Jens Axboe, Jann Horn, Guenter Roeck, io-uring,
	linux-m68k, Christian Brauner, Linux MM,
	Linux Kernel Mailing List

Hi Vlastimil,

On Wed, Nov 20, 2024 at 10:37 AM Vlastimil Babka <[email protected]> wrote:
> On 11/20/24 10:07, Geert Uytterhoeven wrote:
> >> >> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> >> >> index 593c10a02144..8ed9c6923668 100644
> >> >> --- a/include/linux/io_uring_types.h
> >> >> +++ b/include/linux/io_uring_types.h
> >> >> @@ -674,7 +674,11 @@ struct io_kiocb {
> >> >>         struct io_kiocb                 *link;
> >> >>         /* custom credentials, valid IFF REQ_F_CREDS is set */
> >> >>         const struct cred               *creds;
> >> >> -       struct io_wq_work               work;
> >> >> +
> >> >> +       union {
> >> >> +               struct io_wq_work       work;
> >> >> +               freeptr_t               freeptr __aligned(sizeof(freeptr_t));
> >> >
> >> > I'd rather add the __aligned() to the definition of freeptr_t, so it
> >> > applies to all (future) users.
> >> >
> >> > But my main question stays: why is the slab code checking
> >> > IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)?
> >>
> >> I believe it's to match how SLUB normally calculates the offset if no
> >> explicit one is given, in calculate_sizes():
> >>
> >> s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
> >>
> >> Yes there's a sizeof(void *) because freepointer used to be just that and we
> >> forgot to update this place when freeptr_t was introduced (by Jann in
> >> 44f6a42d49350) for handling CONFIG_SLAB_FREELIST_HARDENED. In
> >> get_freepointer() you can see how there's a cast to a pointer eventually.
> >>
> >> Does m68k have different alignment for pointer and unsigned long or both are
> >> 2 bytes? Or any other arch, i.e. should get_freepointer be a union with
> >> unsigned long and void * instead? (or it doesn't matter?)
> >
> > The default alignment for int, long, and pointer is 2 on m68k.
> > On CRIS (no longer supported by Linux), it was 1, IIRC.
> > So the union won't make a difference.
> >
> >> > Perhaps that was just intended to be __alignof__ instead of sizeof()?
> >>
> >> Would it do the right thing everywhere, given the explanation above?
> >
> > It depends. Does anything rely on the offset being a multiple of (at
> > least) 4?
> > > E.g. does anything count in multiples of longs (hi BCPL! ;-), or are
> > the 2 LSB used for a special purpose? (cfr. maple_tree, which uses
> > bit 0 (https://elixir.bootlin.com/linux/v6.12/source/include/linux/maple_tree.h#L46)?
>
> AFAIK no, the goal was just to prevent misaligned accesses. Kees added the:
>
> s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
>
> so maybe he had something else in mind. But I suspect it was just because
> the code already used it elsewhere.
>
> So we might want something like this? But that would be safer for 6.14 so
> I'd suggest the io_uring specific fix meanwhile. Or maybe just add the union
> with freeptr_t but without __aligned plus the part below that changes
> mm/slab_common.c only, as the 6.13 io_uring fix?

As it seems to work fine with s/sizeof/__alignof__/, I have submitted
a patch to just make that change:
https://lore.kernel.org/80c767a5d5927c099aea5178fbf2c897b459fa90.1732106544.git.geert@linux-m68k.org

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2024-11-20 12:48 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-29 15:16 [PATCHSET v3 0/14] Rewrite rsrc node handling Jens Axboe
2024-10-29 15:16 ` [PATCH 01/14] io_uring/nop: add support for testing registered files and buffers Jens Axboe
2024-10-29 15:16 ` [PATCH 02/14] io_uring/rsrc: move struct io_fixed_file to rsrc.h header Jens Axboe
2024-10-29 15:16 ` [PATCH 03/14] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache Jens Axboe
2024-11-19 15:36   ` Guenter Roeck
2024-11-19 16:02     ` Jens Axboe
2024-11-19 16:21       ` Guenter Roeck
2024-11-19 17:49         ` Geert Uytterhoeven
2024-11-19 19:00           ` Jens Axboe
2024-11-19 19:02             ` Geert Uytterhoeven
2024-11-19 19:10               ` Jens Axboe
2024-11-19 19:25                 ` Geert Uytterhoeven
2024-11-19 19:30                   ` Jens Axboe
2024-11-19 19:41                     ` Geert Uytterhoeven
2024-11-19 19:44                       ` Jens Axboe
2024-11-19 19:49                         ` Jens Axboe
2024-11-19 21:46                           ` Guenter Roeck
2024-11-19 22:30                             ` Jens Axboe
2024-11-20  0:08                               ` Guenter Roeck
2024-11-20  1:58                                 ` Jens Axboe
2024-11-20  8:19                               ` Geert Uytterhoeven
2024-11-20  8:47                                 ` Vlastimil Babka
2024-11-20  9:07                                   ` Geert Uytterhoeven
2024-11-20  9:37                                     ` Vlastimil Babka
2024-11-20 12:48                                       ` Geert Uytterhoeven
2024-10-29 15:16 ` [PATCH 04/14] io_uring/splice: open code 2nd direct file assignment Jens Axboe
2024-10-29 15:16 ` [PATCH 05/14] io_uring/rsrc: kill io_charge_rsrc_node() Jens Axboe
2024-10-29 15:16 ` [PATCH 06/14] io_uring/rsrc: get rid of per-ring io_rsrc_node list Jens Axboe
2024-10-29 15:16 ` [PATCH 07/14] io_uring/rsrc: get rid of io_rsrc_node allocation cache Jens Axboe
2024-10-29 15:16 ` [PATCH 08/14] io_uring/rsrc: add an empty io_rsrc_node for sparse buffer entries Jens Axboe
2024-10-29 15:16 ` [PATCH 09/14] io_uring: only initialize io_kiocb rsrc_nodes when needed Jens Axboe
2024-10-29 15:16 ` [PATCH 10/14] io_uring/rsrc: unify file and buffer resource tables Jens Axboe
2024-10-29 15:16 ` [PATCH 11/14] io_uring/rsrc: add io_rsrc_node_lookup() helper Jens Axboe
2024-10-29 15:16 ` [PATCH 12/14] io_uring/filetable: remove io_file_from_index() helper Jens Axboe
2024-10-29 15:16 ` [PATCH 13/14] io_uring/filetable: kill io_reset_alloc_hint() helper Jens Axboe
2024-10-29 15:16 ` [PATCH 14/14] io_uring/rsrc: add io_reset_rsrc_node() helper Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox