* [PATCH] fs: support filename refcount without atomics
@ 2025-03-07 16:11 Mateusz Guzik
2025-03-07 16:18 ` Jens Axboe
` (4 more replies)
0 siblings, 5 replies; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:11 UTC (permalink / raw)
To: brauner
Cc: viro, jack, linux-kernel, linux-fsdevel, io-uring, audit, axboe,
Mateusz Guzik
Atomics are only needed for a combination of io_uring and audit.
Regular file access (even with audit) gets around fine without them.
With this patch 'struct filename' starts with being refcounted using
regular ops.
In order to avoid API explosion in the getname*() family, a dedicated
routine is added to switch the obj to use atomics.
This leaves the room for merely issuing getname(), not issuing the
switch and still trying to manipulate the refcount from another thread.
Catching such cases is facilitated by CONFIG_DEBUG_VFS-dependent
tracking of who created the given filename object and having refname()
and putname() detect if another thread is trying to modify them.
Benchmarked on Sapphire Rapids issuing access() (ops/s):
before: 5106246
after: 5269678 (+3%)
Signed-off-by: Mateusz Guzik <[email protected]>
---
this can be split into 2 patches (refname, initname vs the atomic
change). I also took the liberty to fix up some weird whitespace.
the bench is access1 from will-it-scale (kind of):
https://github.com/antonblanchard/will-it-scale/pull/36
correctness tested with io_uring + audit using the test suite in liburing.
Confirmed mismatched owners show up:
# bpftrace -e 'kprobe:putname /curtask != ((struct filename *)arg0)->owner/ { @[kstack()] = count(); }'
Attaching 1 probe...
^C
@[
putname+5
do_renameat2+279
io_renameat+40
io_issue_sqe+1159
io_wq_submit_work+200
io_worker_handle_work+313
io_wq_worker+218
ret_from_fork+49
ret_from_fork_asm+26
]: 4
@[
putname+5
do_renameat2+287
io_renameat+40
io_issue_sqe+1159
io_wq_submit_work+200
io_worker_handle_work+313
io_wq_worker+218
ret_from_fork+49
ret_from_fork_asm+26
]: 4
fs/namei.c | 44 +++++++++++++++++++++++++++++++++-----------
include/linux/fs.h | 37 ++++++++++++++++++++++++++++++++++++-
io_uring/fs.c | 8 ++++++++
io_uring/openclose.c | 1 +
io_uring/statx.c | 3 +--
io_uring/xattr.c | 2 ++
kernel/auditsc.c | 12 +++++-------
7 files changed, 86 insertions(+), 21 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 06765d320e7e..ff76b495abef 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,6 +125,17 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
+static inline void initname(struct filename *name)
+{
+ name->uptr = NULL;
+ name->aname = NULL;
+ name->is_atomic = false;
+#ifdef CONFIG_DEBUG_VFS
+ name->owner = current;
+#endif
+ atomic_set(&name->refcnt_atomic, 1);
+}
+
struct filename *
getname_flags(const char __user *filename, int flags)
{
@@ -203,10 +214,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
-
- atomic_set(&result->refcnt, 1);
- result->uptr = filename;
- result->aname = NULL;
+ initname(result);
audit_getname(result);
return result;
}
@@ -264,26 +272,40 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- result->uptr = NULL;
- result->aname = NULL;
- atomic_set(&result->refcnt, 1);
+ initname(result);
audit_getname(result);
-
return result;
}
EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
+ int refcnt;
+
if (IS_ERR_OR_NULL(name))
return;
- if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
- return;
+ VFS_BUG_ON(name->owner != current && !name->is_atomic);
- if (!atomic_dec_and_test(&name->refcnt))
+ refcnt = atomic_read(&name->refcnt_atomic);
+ if (WARN_ON_ONCE(!refcnt))
return;
+ /*
+ * If refcnt == 1 then there is nobody who can legally change it.
+ * Short-circuiting this case saves a branch in the common case and
+ * an atomic op for the last unref (for the ->is_atomic variant).
+ */
+ if (refcnt != 1) {
+ if (name->is_atomic) {
+ if (!atomic_dec_and_test(&name->refcnt_atomic))
+ return;
+ } else {
+ if (--name->refcnt)
+ return;
+ }
+ }
+
if (name->name != name->iname) {
__putname(name->name);
kfree(name);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 110d95d04299..b559a611dd15 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2765,11 +2765,19 @@ struct audit_names;
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
- atomic_t refcnt;
+ union {
+ atomic_t refcnt_atomic;
+ int refcnt;
+ };
+#ifdef CONFIG_DEBUG_VFS
+ struct task_struct *owner;
+#endif
+ bool is_atomic;
struct audit_names *aname;
const char iname[];
};
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
+static_assert(sizeof(int) == sizeof(atomic_t)); /* refcount */
static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
@@ -2864,6 +2872,33 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f
extern void putname(struct filename *name);
DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T))
+static inline void makeatomicname(struct filename *name)
+{
+ VFS_BUG_ON(IS_ERR_OR_NULL(name));
+ /*
+ * The name can legitimately already be atomic if it was cached by audit.
+ * If switching the refcount to atomic, we need not to know we are the
+ * only non-atomic user.
+ */
+ VFS_BUG_ON(name->owner != current && !name->is_atomic);
+ /*
+ * Don't bother branching, this is a store to an already dirtied cacheline.
+ */
+ name->is_atomic = true;
+}
+
+static inline struct filename *refname(struct filename *name)
+{
+ VFS_BUG_ON(name->owner != current && !name->is_atomic);
+
+ if (name->is_atomic)
+ atomic_inc(&name->refcnt_atomic);
+ else
+ name->refcnt++;
+
+ return name;
+}
+
extern int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);
diff --git a/io_uring/fs.c b/io_uring/fs.c
index eccea851dd5a..db8d4fe4290d 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -66,12 +66,14 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ren->oldpath = getname(oldf);
if (IS_ERR(ren->oldpath))
return PTR_ERR(ren->oldpath);
+ makeatomicname(ren->oldpath);
ren->newpath = getname(newf);
if (IS_ERR(ren->newpath)) {
putname(ren->oldpath);
return PTR_ERR(ren->newpath);
}
+ makeatomicname(ren->newpath);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
@@ -121,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
un->filename = getname(fname);
if (IS_ERR(un->filename))
return PTR_ERR(un->filename);
+ makeatomicname(un->filename);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
@@ -168,6 +171,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
mkd->filename = getname(fname);
if (IS_ERR(mkd->filename))
return PTR_ERR(mkd->filename);
+ makeatomicname(mkd->filename);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
@@ -212,12 +216,14 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sl->oldpath = getname(oldpath);
if (IS_ERR(sl->oldpath))
return PTR_ERR(sl->oldpath);
+ makeatomicname(sl->oldpath);
sl->newpath = getname(newpath);
if (IS_ERR(sl->newpath)) {
putname(sl->oldpath);
return PTR_ERR(sl->newpath);
}
+ makeatomicname(sl->newpath);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
@@ -257,12 +263,14 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
lnk->oldpath = getname_uflags(oldf, lnk->flags);
if (IS_ERR(lnk->oldpath))
return PTR_ERR(lnk->oldpath);
+ makeatomicname(lnk->oldpath);
lnk->newpath = getname(newf);
if (IS_ERR(lnk->newpath)) {
putname(lnk->oldpath);
return PTR_ERR(lnk->newpath);
}
+ makeatomicname(lnk->newpath);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3357dfa14ca..d1213bd2911b 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -70,6 +70,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
open->filename = NULL;
return ret;
}
+ makeatomicname(open->filename);
open->file_slot = READ_ONCE(sqe->file_index);
if (open->file_slot && (open->how.flags & O_CLOEXEC))
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 6bc4651700a2..892c85a29e5f 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -37,13 +37,12 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sx->flags = READ_ONCE(sqe->statx_flags);
sx->filename = getname_uflags(path, sx->flags);
-
if (IS_ERR(sx->filename)) {
int ret = PTR_ERR(sx->filename);
-
sx->filename = NULL;
return ret;
}
+ makeatomicname(sx->filename);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index de5064fcae8a..73acc6036187 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -96,6 +96,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ix->filename = getname(path);
if (IS_ERR(ix->filename))
return PTR_ERR(ix->filename);
+ makeatomicname(ix->filename);
return 0;
}
@@ -172,6 +173,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ix->filename = getname(path);
if (IS_ERR(ix->filename))
return PTR_ERR(ix->filename);
+ makeatomicname(ix->filename);
return 0;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c853cde9abe..78fd876a5473 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- atomic_inc(&n->name->refcnt);
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
@@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- atomic_inc(&name->refcnt);
+ refname(name);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2369,7 +2367,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
return;
if (name) {
n->name = name;
- atomic_inc(&name->refcnt);
+ refname(name);
}
out:
@@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- atomic_inc(&found_child->name->refcnt);
+ refname(found_child->name);
}
}
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
@ 2025-03-07 16:18 ` Jens Axboe
2025-03-07 16:25 ` Mateusz Guzik
2025-03-07 16:26 ` Matthew Wilcox
` (3 subsequent siblings)
4 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2025-03-07 16:18 UTC (permalink / raw)
To: Mateusz Guzik, brauner
Cc: viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
> +static inline void makeatomicname(struct filename *name)
> +{
> + VFS_BUG_ON(IS_ERR_OR_NULL(name));
> + /*
> + * The name can legitimately already be atomic if it was cached by audit.
> + * If switching the refcount to atomic, we need not to know we are the
> + * only non-atomic user.
> + */
> + VFS_BUG_ON(name->owner != current && !name->is_atomic);
> + /*
> + * Don't bother branching, this is a store to an already dirtied cacheline.
> + */
> + name->is_atomic = true;
> +}
Should this not depend on audit being enabled? io_uring without audit is
fine.
--
Jens Axboe
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:18 ` Jens Axboe
@ 2025-03-07 16:25 ` Mateusz Guzik
2025-03-07 16:32 ` Jens Axboe
0 siblings, 1 reply; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:25 UTC (permalink / raw)
To: Jens Axboe
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
On Fri, Mar 7, 2025 at 5:18 PM Jens Axboe <[email protected]> wrote:
>
> > +static inline void makeatomicname(struct filename *name)
> > +{
> > + VFS_BUG_ON(IS_ERR_OR_NULL(name));
> > + /*
> > + * The name can legitimately already be atomic if it was cached by audit.
> > + * If switching the refcount to atomic, we need not to know we are the
> > + * only non-atomic user.
> > + */
> > + VFS_BUG_ON(name->owner != current && !name->is_atomic);
> > + /*
> > + * Don't bother branching, this is a store to an already dirtied cacheline.
> > + */
> > + name->is_atomic = true;
> > +}
>
> Should this not depend on audit being enabled? io_uring without audit is
> fine.
>
I thought about it, but then I got worried about transitions from
disabled to enabled -- will they suddenly start looking here? Should
this test for audit_enabled, audit_dummy_context() or something else?
I did not want to bother analyzing this.
I'll note though this would be an optimization on top of the current
code, so I don't think it *blocks* the patch.
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
2025-03-07 16:18 ` Jens Axboe
@ 2025-03-07 16:26 ` Matthew Wilcox
2025-03-07 16:32 ` Mateusz Guzik
2025-03-07 16:42 ` Al Viro
` (2 subsequent siblings)
4 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2025-03-07 16:26 UTC (permalink / raw)
To: Mateusz Guzik
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit,
axboe
On Fri, Mar 07, 2025 at 05:11:55PM +0100, Mateusz Guzik wrote:
> +++ b/include/linux/fs.h
> @@ -2765,11 +2765,19 @@ struct audit_names;
> struct filename {
> const char *name; /* pointer to actual string */
> const __user char *uptr; /* original userland pointer */
> - atomic_t refcnt;
> + union {
> + atomic_t refcnt_atomic;
> + int refcnt;
> + };
> +#ifdef CONFIG_DEBUG_VFS
> + struct task_struct *owner;
> +#endif
> + bool is_atomic;
> struct audit_names *aname;
> const char iname[];
> };
7 (or 3) byte hole; try to pad.
Would it make more sense to put the bool between aname and iname where
it will only take one byte instead of 8? Actually, maybe do it as:
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
- atomic_t refcnt;
struct audit_names *aname;
+#ifdef CONFIG_DEBUG_VFS
+ struct task_struct *owner;
+#endif
+ union {
+ atomic_t refcnt_atomic;
+ int refcnt;
+ };
+ bool is_atomic;
const char iname[];
};
and (on 64-bit), you're packed even more efficiently than right now.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:26 ` Matthew Wilcox
@ 2025-03-07 16:32 ` Mateusz Guzik
0 siblings, 0 replies; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:32 UTC (permalink / raw)
To: Matthew Wilcox
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit,
axboe
On Fri, Mar 7, 2025 at 5:26 PM Matthew Wilcox <[email protected]> wrote:
>
> On Fri, Mar 07, 2025 at 05:11:55PM +0100, Mateusz Guzik wrote:
> > +++ b/include/linux/fs.h
> > @@ -2765,11 +2765,19 @@ struct audit_names;
> > struct filename {
> > const char *name; /* pointer to actual string */
> > const __user char *uptr; /* original userland pointer */
> > - atomic_t refcnt;
> > + union {
> > + atomic_t refcnt_atomic;
> > + int refcnt;
> > + };
> > +#ifdef CONFIG_DEBUG_VFS
> > + struct task_struct *owner;
> > +#endif
> > + bool is_atomic;
> > struct audit_names *aname;
> > const char iname[];
> > };
>
> 7 (or 3) byte hole; try to pad.
>
> Would it make more sense to put the bool between aname and iname where
> it will only take one byte instead of 8?
On the stock kernel there is already a 4 byte hole between the
refcount and aname, which is where is_atomic lands with debug
disabled. I.e. no size changes in production kernels with and without
the change.
However, now that you mention it the debug owner field is misplaced --
it should have landed *after* is_atomic. Maybe Christian will be happy
to just move it, otherwise I'm going to include this in a v2.
The iname field is expected to be aligned, so I don't believe
shuffling the is_atomic flag helps anyone:
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:25 ` Mateusz Guzik
@ 2025-03-07 16:32 ` Jens Axboe
2025-03-07 16:35 ` Mateusz Guzik
0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2025-03-07 16:32 UTC (permalink / raw)
To: Mateusz Guzik
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
On 3/7/25 9:25 AM, Mateusz Guzik wrote:
> On Fri, Mar 7, 2025 at 5:18?PM Jens Axboe <[email protected]> wrote:
>>
>>> +static inline void makeatomicname(struct filename *name)
>>> +{
>>> + VFS_BUG_ON(IS_ERR_OR_NULL(name));
>>> + /*
>>> + * The name can legitimately already be atomic if it was cached by audit.
>>> + * If switching the refcount to atomic, we need not to know we are the
>>> + * only non-atomic user.
>>> + */
>>> + VFS_BUG_ON(name->owner != current && !name->is_atomic);
>>> + /*
>>> + * Don't bother branching, this is a store to an already dirtied cacheline.
>>> + */
>>> + name->is_atomic = true;
>>> +}
>>
>> Should this not depend on audit being enabled? io_uring without audit is
>> fine.
>>
>
> I thought about it, but then I got worried about transitions from
> disabled to enabled -- will they suddenly start looking here? Should
> this test for audit_enabled, audit_dummy_context() or something else?
> I did not want to bother analyzing this.
Let me take a look at it, the markings for when to switch atomic are not
accurate - it only really needs to happen for offload situations only,
and if audit is enabled and tracking. So I think we can great improve
upon this patch.
> I'll note though this would be an optimization on top of the current
> code, so I don't think it *blocks* the patch.
Let's not go with something half-done if we can get it right the first
time.
--
Jens Axboe
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:32 ` Jens Axboe
@ 2025-03-07 16:35 ` Mateusz Guzik
2025-03-07 16:38 ` Jens Axboe
0 siblings, 1 reply; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:35 UTC (permalink / raw)
To: Jens Axboe
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
On Fri, Mar 7, 2025 at 5:32 PM Jens Axboe <[email protected]> wrote:
>
> On 3/7/25 9:25 AM, Mateusz Guzik wrote:
> > On Fri, Mar 7, 2025 at 5:18?PM Jens Axboe <[email protected]> wrote:
> >>
> >>> +static inline void makeatomicname(struct filename *name)
> >>> +{
> >>> + VFS_BUG_ON(IS_ERR_OR_NULL(name));
> >>> + /*
> >>> + * The name can legitimately already be atomic if it was cached by audit.
> >>> + * If switching the refcount to atomic, we need not to know we are the
> >>> + * only non-atomic user.
> >>> + */
> >>> + VFS_BUG_ON(name->owner != current && !name->is_atomic);
> >>> + /*
> >>> + * Don't bother branching, this is a store to an already dirtied cacheline.
> >>> + */
> >>> + name->is_atomic = true;
> >>> +}
> >>
> >> Should this not depend on audit being enabled? io_uring without audit is
> >> fine.
> >>
> >
> > I thought about it, but then I got worried about transitions from
> > disabled to enabled -- will they suddenly start looking here? Should
> > this test for audit_enabled, audit_dummy_context() or something else?
> > I did not want to bother analyzing this.
>
> Let me take a look at it, the markings for when to switch atomic are not
> accurate - it only really needs to happen for offload situations only,
> and if audit is enabled and tracking. So I think we can great improve
> upon this patch.
>
I aimed for this to be a NOP for io_uring, so to speak, specifically
because I could not be arsed to deal with audit.
> > I'll note though this would be an optimization on top of the current
> > code, so I don't think it *blocks* the patch.
>
> Let's not go with something half-done if we can get it right the first
> time.
>
Since you volunteered to sort this out, I'll be happy to wait.
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:35 ` Mateusz Guzik
@ 2025-03-07 16:38 ` Jens Axboe
2025-03-07 16:39 ` Mateusz Guzik
0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2025-03-07 16:38 UTC (permalink / raw)
To: Mateusz Guzik
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
On 3/7/25 9:35 AM, Mateusz Guzik wrote:
> On Fri, Mar 7, 2025 at 5:32?PM Jens Axboe <[email protected]> wrote:
>>
>> On 3/7/25 9:25 AM, Mateusz Guzik wrote:
>>> On Fri, Mar 7, 2025 at 5:18?PM Jens Axboe <[email protected]> wrote:
>>>>
>>>>> +static inline void makeatomicname(struct filename *name)
>>>>> +{
>>>>> + VFS_BUG_ON(IS_ERR_OR_NULL(name));
>>>>> + /*
>>>>> + * The name can legitimately already be atomic if it was cached by audit.
>>>>> + * If switching the refcount to atomic, we need not to know we are the
>>>>> + * only non-atomic user.
>>>>> + */
>>>>> + VFS_BUG_ON(name->owner != current && !name->is_atomic);
>>>>> + /*
>>>>> + * Don't bother branching, this is a store to an already dirtied cacheline.
>>>>> + */
>>>>> + name->is_atomic = true;
>>>>> +}
>>>>
>>>> Should this not depend on audit being enabled? io_uring without audit is
>>>> fine.
>>>>
>>>
>>> I thought about it, but then I got worried about transitions from
>>> disabled to enabled -- will they suddenly start looking here? Should
>>> this test for audit_enabled, audit_dummy_context() or something else?
>>> I did not want to bother analyzing this.
>>
>> Let me take a look at it, the markings for when to switch atomic are not
>> accurate - it only really needs to happen for offload situations only,
>> and if audit is enabled and tracking. So I think we can great improve
>> upon this patch.
>>
>
> I aimed for this to be a NOP for io_uring, so to speak, specifically
> because I could not be arsed to deal with audit.
Hah I hear ya... But right now it seems to mark it atomic for any of the
fs based ops, which is not really necessary.
>>> I'll note though this would be an optimization on top of the current
>>> code, so I don't think it *blocks* the patch.
>>
>> Let's not go with something half-done if we can get it right the first
>> time.
>>
>
> Since you volunteered to sort this out, I'll be happy to wait.
I'll take a look start next week, don't think it should be too bad. You
already did 90% of the work.
--
Jens Axboe
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:38 ` Jens Axboe
@ 2025-03-07 16:39 ` Mateusz Guzik
0 siblings, 0 replies; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:39 UTC (permalink / raw)
To: Jens Axboe
Cc: brauner, viro, jack, linux-kernel, linux-fsdevel, io-uring, audit
On Fri, Mar 7, 2025 at 5:38 PM Jens Axboe <[email protected]> wrote:
>
> On 3/7/25 9:35 AM, Mateusz Guzik wrote:
> > Since you volunteered to sort this out, I'll be happy to wait.
>
> I'll take a look start next week, don't think it should be too bad. You
> already did 90% of the work.
>
Sounds good. In the mean time there may be io_uring-unrelated feedback.
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
2025-03-07 16:18 ` Jens Axboe
2025-03-07 16:26 ` Matthew Wilcox
@ 2025-03-07 16:42 ` Al Viro
2025-03-07 16:44 ` Mateusz Guzik
2025-03-08 13:35 ` kernel test robot
2025-03-08 13:46 ` kernel test robot
4 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2025-03-07 16:42 UTC (permalink / raw)
To: Mateusz Guzik
Cc: brauner, jack, linux-kernel, linux-fsdevel, io-uring, audit,
axboe
On Fri, Mar 07, 2025 at 05:11:55PM +0100, Mateusz Guzik wrote:
> Atomics are only needed for a combination of io_uring and audit.
>
> Regular file access (even with audit) gets around fine without them.
>
> With this patch 'struct filename' starts with being refcounted using
> regular ops.
>
> In order to avoid API explosion in the getname*() family, a dedicated
> routine is added to switch the obj to use atomics.
>
> This leaves the room for merely issuing getname(), not issuing the
> switch and still trying to manipulate the refcount from another thread.
>
> Catching such cases is facilitated by CONFIG_DEBUG_VFS-dependent
> tracking of who created the given filename object and having refname()
> and putname() detect if another thread is trying to modify them.
Not a good way to handle that, IMO.
Atomics do hurt there, but they are only plastering over the real
problem - names formed in one thread, inserted into audit context
there and operation involving them happening in a different thread.
Refcounting avoids an instant memory corruption, but the real PITA
is in audit users of that stuff.
IMO we should *NOT* grab an audit names slot at getname() time -
that ought to be done explicitly at later points.
The obstacle is that currently there still are several retry loop
with getname() done in it; I've most of that dealt with, need to
finish that series.
And yes, refcount becomes non-atomic as the result.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:42 ` Al Viro
@ 2025-03-07 16:44 ` Mateusz Guzik
2025-03-07 22:58 ` Mateusz Guzik
0 siblings, 1 reply; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 16:44 UTC (permalink / raw)
To: Al Viro; +Cc: brauner, jack, linux-kernel, linux-fsdevel, io-uring, audit,
axboe
On Fri, Mar 7, 2025 at 5:42 PM Al Viro <[email protected]> wrote:
>
> On Fri, Mar 07, 2025 at 05:11:55PM +0100, Mateusz Guzik wrote:
> > Atomics are only needed for a combination of io_uring and audit.
> >
> > Regular file access (even with audit) gets around fine without them.
> >
> > With this patch 'struct filename' starts with being refcounted using
> > regular ops.
> >
> > In order to avoid API explosion in the getname*() family, a dedicated
> > routine is added to switch the obj to use atomics.
> >
> > This leaves the room for merely issuing getname(), not issuing the
> > switch and still trying to manipulate the refcount from another thread.
> >
> > Catching such cases is facilitated by CONFIG_DEBUG_VFS-dependent
> > tracking of who created the given filename object and having refname()
> > and putname() detect if another thread is trying to modify them.
>
> Not a good way to handle that, IMO.
>
> Atomics do hurt there, but they are only plastering over the real
> problem - names formed in one thread, inserted into audit context
> there and operation involving them happening in a different thread.
>
> Refcounting avoids an instant memory corruption, but the real PITA
> is in audit users of that stuff.
>
> IMO we should *NOT* grab an audit names slot at getname() time -
> that ought to be done explicitly at later points.
>
> The obstacle is that currently there still are several retry loop
> with getname() done in it; I've most of that dealt with, need to
> finish that series.
>
> And yes, refcount becomes non-atomic as the result.
Well yes, it was audit which caused the appearance of atomics in the
first place. I was looking for an easy way out.
If you have something which gets rid of the underlying problem and it
is going to land in the foreseeable future, I wont be defending this
approach.
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:44 ` Mateusz Guzik
@ 2025-03-07 22:58 ` Mateusz Guzik
0 siblings, 0 replies; 14+ messages in thread
From: Mateusz Guzik @ 2025-03-07 22:58 UTC (permalink / raw)
To: Al Viro; +Cc: brauner, jack, linux-kernel, linux-fsdevel, io-uring, audit,
axboe
On Fri, Mar 7, 2025 at 5:44 PM Mateusz Guzik <[email protected]> wrote:
>
> On Fri, Mar 7, 2025 at 5:42 PM Al Viro <[email protected]> wrote:
> > Not a good way to handle that, IMO.
> >
> > Atomics do hurt there, but they are only plastering over the real
> > problem - names formed in one thread, inserted into audit context
> > there and operation involving them happening in a different thread.
> >
> > Refcounting avoids an instant memory corruption, but the real PITA
> > is in audit users of that stuff.
> >
> > IMO we should *NOT* grab an audit names slot at getname() time -
> > that ought to be done explicitly at later points.
> >
I was looking at doing that, but the code is kind of a mess and I bailed.
> > The obstacle is that currently there still are several retry loop
> > with getname() done in it; I've most of that dealt with, need to
> > finish that series.
> >
> > And yes, refcount becomes non-atomic as the result.
>
> Well yes, it was audit which caused the appearance of atomics in the
> first place. I was looking for an easy way out.
>
> If you have something which gets rid of the underlying problem and it
> is going to land in the foreseeable future, I wont be defending this
> approach.
>
It is unclear to me if you are NAKing the patch, or merely pointing
out this can be done in a better way (which I agree with)
Some time ago I posted a much simpler patch to merely dodge the last
decrement [1], which already accomplishes what I was looking for.
Christian did not like it and wanted something which only deals with
atomics when audit is enabled.
I should have done that patch slightly differently, but bottom line is
the following in putname():
refcnt = atomic_read(&name->refcnt);
if (refcnt != 1) {
if (WARN_ON_ONCE(!refcnt))
return;
if (!atomic_dec_and_test(&name->refcnt))
return;
}
So if you are NAKing the regular -> atomic switch patch, how about the
above as a quick hack until the issue gets resolved? It is trivial to
reason about (refcnt == 1 means nobody can do anything) and guarantees
to dodge one atomic (which in case of no audit means all consumers). I
can repost touched up if you are OK with it (the original posting
issues atomic_read twice).
As for the bigger patch posted here, Jens wants the io_uring bits done
differently and offered to handle them in the upcoming week. I think a
clear statement if the patch is a no-go would be appreciated.
Link 1: https://lore.kernel.org/linux-fsdevel/[email protected]/
--
Mateusz Guzik <mjguzik gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
` (2 preceding siblings ...)
2025-03-07 16:42 ` Al Viro
@ 2025-03-08 13:35 ` kernel test robot
2025-03-08 13:46 ` kernel test robot
4 siblings, 0 replies; 14+ messages in thread
From: kernel test robot @ 2025-03-08 13:35 UTC (permalink / raw)
To: Mateusz Guzik, brauner
Cc: llvm, oe-kbuild-all, viro, jack, linux-kernel, linux-fsdevel,
io-uring, audit, axboe, Mateusz Guzik
Hi Mateusz,
kernel test robot noticed the following build errors:
[auto build test ERROR on brauner-vfs/vfs.all]
[also build test ERROR on linus/master v6.14-rc5 next-20250307]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mateusz-Guzik/fs-support-filename-refcount-without-atomics/20250308-002442
base: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs.all
patch link: https://lore.kernel.org/r/20250307161155.760949-1-mjguzik%40gmail.com
patch subject: [PATCH] fs: support filename refcount without atomics
config: i386-buildonly-randconfig-003-20250308 (https://download.01.org/0day-ci/archive/20250308/[email protected]/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250308/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All errors (new ones prefixed by >>):
In file included from arch/x86/kernel/asm-offsets.c:14:
In file included from include/linux/suspend.h:5:
In file included from include/linux/swap.h:9:
In file included from include/linux/memcontrol.h:13:
In file included from include/linux/cgroup.h:17:
>> include/linux/fs.h:2875:19: error: no member named 'owner' in 'struct filename'
2875 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ~~~~ ^
include/linux/vfsdebug.h:35:47: note: expanded from macro 'VFS_BUG_ON'
35 | #define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
| ^~~~
include/linux/build_bug.h:30:63: note: expanded from macro 'BUILD_BUG_ON_INVALID'
30 | #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
| ^
In file included from arch/x86/kernel/asm-offsets.c:14:
In file included from include/linux/suspend.h:5:
In file included from include/linux/swap.h:9:
In file included from include/linux/memcontrol.h:13:
In file included from include/linux/cgroup.h:17:
include/linux/fs.h:2884:19: error: no member named 'owner' in 'struct filename'
2884 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ~~~~ ^
include/linux/vfsdebug.h:35:47: note: expanded from macro 'VFS_BUG_ON'
35 | #define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
| ^~~~
include/linux/build_bug.h:30:63: note: expanded from macro 'BUILD_BUG_ON_INVALID'
30 | #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
| ^
2 errors generated.
make[3]: *** [scripts/Makefile.build:102: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=2759713076
make[3]: Target 'prepare' not remade because of errors.
make[2]: *** [Makefile:1264: prepare0] Error 2 shuffle=2759713076
make[2]: Target 'prepare' not remade because of errors.
make[1]: *** [Makefile:251: __sub-make] Error 2 shuffle=2759713076
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:251: __sub-make] Error 2 shuffle=2759713076
make: Target 'prepare' not remade because of errors.
vim +2875 include/linux/fs.h
2866
2867 static inline void makeatomicname(struct filename *name)
2868 {
2869 VFS_BUG_ON(IS_ERR_OR_NULL(name));
2870 /*
2871 * The name can legitimately already be atomic if it was cached by audit.
2872 * If switching the refcount to atomic, we need not to know we are the
2873 * only non-atomic user.
2874 */
> 2875 VFS_BUG_ON(name->owner != current && !name->is_atomic);
2876 /*
2877 * Don't bother branching, this is a store to an already dirtied cacheline.
2878 */
2879 name->is_atomic = true;
2880 }
2881
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] fs: support filename refcount without atomics
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
` (3 preceding siblings ...)
2025-03-08 13:35 ` kernel test robot
@ 2025-03-08 13:46 ` kernel test robot
4 siblings, 0 replies; 14+ messages in thread
From: kernel test robot @ 2025-03-08 13:46 UTC (permalink / raw)
To: Mateusz Guzik, brauner
Cc: oe-kbuild-all, viro, jack, linux-kernel, linux-fsdevel, io-uring,
audit, axboe, Mateusz Guzik
Hi Mateusz,
kernel test robot noticed the following build errors:
[auto build test ERROR on brauner-vfs/vfs.all]
[also build test ERROR on linus/master v6.14-rc5 next-20250307]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mateusz-Guzik/fs-support-filename-refcount-without-atomics/20250308-002442
base: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs.all
patch link: https://lore.kernel.org/r/20250307161155.760949-1-mjguzik%40gmail.com
patch subject: [PATCH] fs: support filename refcount without atomics
config: i386-buildonly-randconfig-001-20250308 (https://download.01.org/0day-ci/archive/20250308/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250308/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All errors (new ones prefixed by >>):
In file included from include/linux/container_of.h:5,
from include/linux/list.h:5,
from include/linux/swait.h:5,
from include/linux/completion.h:12,
from include/linux/crypto.h:15,
from arch/x86/kernel/asm-offsets.c:9:
include/linux/fs.h: In function 'makeatomicname':
>> include/linux/fs.h:2875:24: error: 'struct filename' has no member named 'owner'
2875 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ^~
include/linux/build_bug.h:30:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
30 | #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
| ^
include/linux/fs.h:2875:9: note: in expansion of macro 'VFS_BUG_ON'
2875 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ^~~~~~~~~~
include/linux/fs.h: In function 'refname':
include/linux/fs.h:2884:24: error: 'struct filename' has no member named 'owner'
2884 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ^~
include/linux/build_bug.h:30:63: note: in definition of macro 'BUILD_BUG_ON_INVALID'
30 | #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
| ^
include/linux/fs.h:2884:9: note: in expansion of macro 'VFS_BUG_ON'
2884 | VFS_BUG_ON(name->owner != current && !name->is_atomic);
| ^~~~~~~~~~
make[3]: *** [scripts/Makefile.build:102: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=2878351160
make[3]: Target 'prepare' not remade because of errors.
make[2]: *** [Makefile:1264: prepare0] Error 2 shuffle=2878351160
make[2]: Target 'prepare' not remade because of errors.
make[1]: *** [Makefile:251: __sub-make] Error 2 shuffle=2878351160
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:251: __sub-make] Error 2 shuffle=2878351160
make: Target 'prepare' not remade because of errors.
vim +2875 include/linux/fs.h
2866
2867 static inline void makeatomicname(struct filename *name)
2868 {
2869 VFS_BUG_ON(IS_ERR_OR_NULL(name));
2870 /*
2871 * The name can legitimately already be atomic if it was cached by audit.
2872 * If switching the refcount to atomic, we need not to know we are the
2873 * only non-atomic user.
2874 */
> 2875 VFS_BUG_ON(name->owner != current && !name->is_atomic);
2876 /*
2877 * Don't bother branching, this is a store to an already dirtied cacheline.
2878 */
2879 name->is_atomic = true;
2880 }
2881
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-03-08 13:47 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-07 16:11 [PATCH] fs: support filename refcount without atomics Mateusz Guzik
2025-03-07 16:18 ` Jens Axboe
2025-03-07 16:25 ` Mateusz Guzik
2025-03-07 16:32 ` Jens Axboe
2025-03-07 16:35 ` Mateusz Guzik
2025-03-07 16:38 ` Jens Axboe
2025-03-07 16:39 ` Mateusz Guzik
2025-03-07 16:26 ` Matthew Wilcox
2025-03-07 16:32 ` Mateusz Guzik
2025-03-07 16:42 ` Al Viro
2025-03-07 16:44 ` Mateusz Guzik
2025-03-07 22:58 ` Mateusz Guzik
2025-03-08 13:35 ` kernel test robot
2025-03-08 13:46 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox