* [PATCH v7 1/3] fs: add offset parameter to iterate_dir function
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
@ 2021-12-21 16:40 ` Stefan Roesch
2021-12-21 16:40 ` [PATCH v7 2/3] fs: split off vfs_getdents function of getdents64 syscall Stefan Roesch
` (5 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Stefan Roesch @ 2021-12-21 16:40 UTC (permalink / raw)
To: io-uring, linux-fsdevel; +Cc: torvalds, shr
This change adds the offset parameter to the iterate_dir
function. The offset paramater allows the caller to specify
the offset position.
This change is required to support getdents in io_uring.
Signed-off-by: Stefan Roesch <[email protected]>
---
arch/alpha/kernel/osf_sys.c | 2 +-
fs/ecryptfs/file.c | 2 +-
fs/exportfs/expfs.c | 2 +-
fs/ksmbd/smb2pdu.c | 3 ++-
fs/ksmbd/vfs.c | 4 ++--
fs/nfsd/nfs4recover.c | 2 +-
fs/nfsd/vfs.c | 2 +-
fs/overlayfs/readdir.c | 6 +++---
fs/readdir.c | 28 ++++++++++++++++++----------
include/linux/fs.h | 2 +-
10 files changed, 31 insertions(+), 22 deletions(-)
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 8bbeebb73cf0..cf68c459bca6 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -162,7 +162,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
if (!arg.file)
return -EBADF;
- error = iterate_dir(arg.file, &buf.ctx);
+ error = iterate_dir(arg.file, &buf.ctx, &arg.file->f_pos);
if (error >= 0)
error = buf.error;
if (count != buf.count)
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 18d5b91cb573..b68f1945e615 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -109,7 +109,7 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
.sb = inode->i_sb,
};
lower_file = ecryptfs_file_to_lower(file);
- rc = iterate_dir(lower_file, &buf.ctx);
+ rc = iterate_dir(lower_file, &buf.ctx, &lower_file->f_pos);
ctx->pos = buf.ctx.pos;
if (rc < 0)
goto out;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0106eba46d5a..654e2d4b1d4f 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -323,7 +323,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
while (1) {
int old_seq = buffer.sequence;
- error = iterate_dir(file, &buffer.ctx);
+ error = iterate_dir(file, &buffer.ctx, &file->f_pos);
if (buffer.found) {
error = 0;
break;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 49c9da37315c..fd4cb995d06d 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -3925,7 +3925,8 @@ int smb2_query_dir(struct ksmbd_work *work)
dir_fp->readdir_data.private = &query_dir_private;
set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir);
- rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx);
+ rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx,
+ &dir_fp->filp->f_pos);
if (rc == 0)
restart_ctx(&dir_fp->readdir_data.ctx);
if (rc == -ENOSPC)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 19d36393974c..5b8e23d3c846 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1136,7 +1136,7 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
set_ctx_actor(&readdir_data.ctx, __dir_empty);
readdir_data.dirent_count = 0;
- err = iterate_dir(fp->filp, &readdir_data.ctx);
+ err = iterate_dir(fp->filp, &readdir_data.ctx, &fp->filp->f_pos);
if (readdir_data.dirent_count > 2)
err = -ENOTEMPTY;
else
@@ -1186,7 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
if (IS_ERR(dfilp))
return PTR_ERR(dfilp);
- ret = iterate_dir(dfilp, &readdir_data.ctx);
+ ret = iterate_dir(dfilp, &readdir_data.ctx, &dfilp->f_pos);
if (readdir_data.dirent_count > 0)
ret = 0;
fput(dfilp);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 6fedc49726bf..79a2799891e4 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -307,7 +307,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
return status;
}
- status = iterate_dir(nn->rec_file, &ctx.ctx);
+ status = iterate_dir(nn->rec_file, &ctx.ctx, &nn->rec_file->f_pos);
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c99857689e2c..085864c25318 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1980,7 +1980,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
buf.used = 0;
buf.full = 0;
- host_err = iterate_dir(file, &buf.ctx);
+ host_err = iterate_dir(file, &buf.ctx, &file->f_pos);
if (buf.full)
host_err = 0;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 150fdf3bc68d..52167ff9e513 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -306,7 +306,7 @@ static inline int ovl_dir_read(struct path *realpath,
do {
rdd->count = 0;
rdd->err = 0;
- err = iterate_dir(realfile, &rdd->ctx);
+ err = iterate_dir(realfile, &rdd->ctx, &realfile->f_pos);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
@@ -722,7 +722,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
return PTR_ERR(rdt.cache);
}
- err = iterate_dir(od->realfile, &rdt.ctx);
+ err = iterate_dir(od->realfile, &rdt.ctx, &od->realfile->f_pos);
ctx->pos = rdt.ctx.pos;
return err;
@@ -753,7 +753,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
err = ovl_iterate_real(file, ctx);
} else {
- err = iterate_dir(od->realfile, ctx);
+ err = iterate_dir(od->realfile, ctx, &od->realfile->f_pos);
}
goto out;
}
diff --git a/fs/readdir.c b/fs/readdir.c
index 09e8ed7d4161..c1e6612e0f47 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -36,8 +36,13 @@
unsafe_copy_to_user(dst, src, len, label); \
} while (0)
-
-int iterate_dir(struct file *file, struct dir_context *ctx)
+/**
+ * iterate_dir - iterate over directory
+ * @file : pointer to file struct of directory
+ * @ctx : pointer to directory ctx structure
+ * @pos : file offset
+ */
+int iterate_dir(struct file *file, struct dir_context *ctx, loff_t *pos)
{
struct inode *inode = file_inode(file);
bool shared = false;
@@ -60,12 +65,15 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
- ctx->pos = file->f_pos;
+ ctx->pos = *pos;
+
if (shared)
res = file->f_op->iterate_shared(file, ctx);
else
res = file->f_op->iterate(file, ctx);
- file->f_pos = ctx->pos;
+
+ *pos = ctx->pos;
+
fsnotify_access(file);
file_accessed(file);
}
@@ -190,7 +198,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
if (buf.result)
error = buf.result;
@@ -283,7 +291,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -366,7 +374,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -379,7 +387,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
+
return error;
}
@@ -448,7 +456,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
if (buf.result)
error = buf.result;
@@ -534,7 +542,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6b8dc1a78df6..e1becbe86b07 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3340,7 +3340,7 @@ const char *simple_get_link(struct dentry *, struct inode *,
struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;
-extern int iterate_dir(struct file *, struct dir_context *);
+extern int iterate_dir(struct file *, struct dir_context *, loff_t *pos);
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int flags);
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 2/3] fs: split off vfs_getdents function of getdents64 syscall
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
2021-12-21 16:40 ` [PATCH v7 1/3] fs: add offset parameter to iterate_dir function Stefan Roesch
@ 2021-12-21 16:40 ` Stefan Roesch
2021-12-21 16:40 ` [PATCH v7 3/3] io_uring: add support for getdents64 Stefan Roesch
` (4 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Stefan Roesch @ 2021-12-21 16:40 UTC (permalink / raw)
To: io-uring, linux-fsdevel; +Cc: torvalds, shr, Christian Brauner
This splits off the vfs_getdents function from the getdents64 system
call. This allows io_uring to call the vfs_getdents function.
Signed-off-by: Stefan Roesch <[email protected]>
Acked-by: Christian Brauner <[email protected]>
---
fs/internal.h | 8 ++++++++
fs/readdir.c | 36 ++++++++++++++++++++++++++++--------
2 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/fs/internal.h b/fs/internal.h
index 7979ff8d168c..432ea3ce76ec 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -194,3 +194,11 @@ long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags);
+
+/*
+ * fs/readdir.c
+ */
+struct linux_dirent64;
+
+int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
+ unsigned int count, loff_t *pos);
diff --git a/fs/readdir.c b/fs/readdir.c
index c1e6612e0f47..1b1fade87525 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -21,6 +21,7 @@
#include <linux/unistd.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include "internal.h"
#include <asm/unaligned.h>
@@ -359,22 +360,25 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
return -EFAULT;
}
-SYSCALL_DEFINE3(getdents64, unsigned int, fd,
- struct linux_dirent64 __user *, dirent, unsigned int, count)
+/**
+ * vfs_getdents - getdents without fdget
+ * @file : pointer to file struct of directory
+ * @dirent : pointer to user directory structure
+ * @count : size of buffer
+ * @pos : file pos
+ */
+int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
+ unsigned int count, loff_t *pos)
{
- struct fd f;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
+ .ctx.pos = *pos,
.count = count,
.current_dir = dirent
};
int error;
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx, &f.file->f_pos);
+ error = iterate_dir(file, &buf.ctx, pos);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -391,6 +395,22 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
return error;
}
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+ struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+ struct fd f;
+ int error;
+
+ f = fdget_pos(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = vfs_getdents(f.file, dirent, count, &f.file->f_pos);
+
+ fdput_pos(f);
+ return error;
+ }
+
#ifdef CONFIG_COMPAT
struct compat_old_linux_dirent {
compat_ulong_t d_ino;
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 3/3] io_uring: add support for getdents64
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
2021-12-21 16:40 ` [PATCH v7 1/3] fs: add offset parameter to iterate_dir function Stefan Roesch
2021-12-21 16:40 ` [PATCH v7 2/3] fs: split off vfs_getdents function of getdents64 syscall Stefan Roesch
@ 2021-12-21 16:40 ` Stefan Roesch
2021-12-21 17:15 ` [PATCH v7 0/3] io_uring: add getdents64 support Linus Torvalds
` (3 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Stefan Roesch @ 2021-12-21 16:40 UTC (permalink / raw)
To: io-uring, linux-fsdevel; +Cc: torvalds, shr, Pavel Begunkov
This adds support for getdents64 to io_uring.
Signed-off-by: Stefan Roesch <[email protected]>
Reviewed-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++
include/uapi/linux/io_uring.h | 1 +
2 files changed, 53 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5092dfe56da6..c8258c784116 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -693,6 +693,13 @@ struct io_hardlink {
int flags;
};
+struct io_getdents {
+ struct file *file;
+ struct linux_dirent64 __user *dirent;
+ unsigned int count;
+ loff_t pos;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -858,6 +865,7 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
+ struct io_getdents getdents;
};
u8 opcode;
@@ -1107,6 +1115,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_GETDENTS] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -4068,6 +4079,42 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_getdents *getdents = &req->getdents;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags || sqe->buf_index)
+ return -EINVAL;
+
+ getdents->pos = READ_ONCE(sqe->off);
+ getdents->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ getdents->count = READ_ONCE(sqe->len);
+
+ return 0;
+}
+
+static int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_getdents *getdents = &req->getdents;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = vfs_getdents(req->file, getdents->dirent, getdents->count, &getdents->pos);
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+
+ req_set_fail(req);
+ }
+
+ io_req_complete(req, ret);
+ return 0;
+}
+
static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -6574,6 +6621,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_GETDENTS:
+ return io_getdents_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6857,6 +6906,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_GETDENTS:
+ ret = io_getdents(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..57dc88db5793 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -143,6 +143,7 @@ enum {
IORING_OP_MKDIRAT,
IORING_OP_SYMLINKAT,
IORING_OP_LINKAT,
+ IORING_OP_GETDENTS,
/* this goes last, obviously */
IORING_OP_LAST,
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
` (2 preceding siblings ...)
2021-12-21 16:40 ` [PATCH v7 3/3] io_uring: add support for getdents64 Stefan Roesch
@ 2021-12-21 17:15 ` Linus Torvalds
2021-12-31 23:15 ` Al Viro
2021-12-21 19:17 ` Jens Axboe
` (2 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2021-12-21 17:15 UTC (permalink / raw)
To: Stefan Roesch; +Cc: io-uring, linux-fsdevel
On Tue, Dec 21, 2021 at 8:40 AM Stefan Roesch <[email protected]> wrote:
>
> This series adds support for getdents64 in liburing. The intent is to
> provide a more complete I/O interface for io_uring.
Ack, this series looks much more natural to me now.
I think some of the callers of "iterate_dir()" could probably be
cleaned up with the added argument, but for this series I prefer that
mindless approach of just doing "&(arg1)->f_pos" as the third argument
that is clearly a no-op.
So the end result is perhaps not as beautiful as it could be, but I
think the patch series DTRT.
Linus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-21 17:15 ` [PATCH v7 0/3] io_uring: add getdents64 support Linus Torvalds
@ 2021-12-31 23:15 ` Al Viro
2022-01-01 19:59 ` Al Viro
0 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2021-12-31 23:15 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Stefan Roesch, io-uring, linux-fsdevel
On Tue, Dec 21, 2021 at 09:15:24AM -0800, Linus Torvalds wrote:
> On Tue, Dec 21, 2021 at 8:40 AM Stefan Roesch <[email protected]> wrote:
> >
> > This series adds support for getdents64 in liburing. The intent is to
> > provide a more complete I/O interface for io_uring.
>
> Ack, this series looks much more natural to me now.
>
> I think some of the callers of "iterate_dir()" could probably be
> cleaned up with the added argument, but for this series I prefer that
> mindless approach of just doing "&(arg1)->f_pos" as the third argument
> that is clearly a no-op.
>
> So the end result is perhaps not as beautiful as it could be, but I
> think the patch series DTRT.
It really does not. Think what happens if you pass e.g. an odd position
to that on e.g. ext2/3/4. Or just use it on tmpfs, for that matter.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-31 23:15 ` Al Viro
@ 2022-01-01 19:59 ` Al Viro
2022-01-03 7:03 ` Jann Horn
0 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2022-01-01 19:59 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Stefan Roesch, io-uring, linux-fsdevel
On Fri, Dec 31, 2021 at 11:15:55PM +0000, Al Viro wrote:
> On Tue, Dec 21, 2021 at 09:15:24AM -0800, Linus Torvalds wrote:
> > On Tue, Dec 21, 2021 at 8:40 AM Stefan Roesch <[email protected]> wrote:
> > >
> > > This series adds support for getdents64 in liburing. The intent is to
> > > provide a more complete I/O interface for io_uring.
> >
> > Ack, this series looks much more natural to me now.
> >
> > I think some of the callers of "iterate_dir()" could probably be
> > cleaned up with the added argument, but for this series I prefer that
> > mindless approach of just doing "&(arg1)->f_pos" as the third argument
> > that is clearly a no-op.
> >
> > So the end result is perhaps not as beautiful as it could be, but I
> > think the patch series DTRT.
>
> It really does not. Think what happens if you pass e.g. an odd position
> to that on e.g. ext2/3/4. Or just use it on tmpfs, for that matter.
[A bit of a braindump follows; my apologies for reminding of some
unpleasant parts of history]
The real problem here is the userland ABI along the lines of pread for
directories. There's a good reason why we (as well as everybody else,
AFAIK) do not have pgetdents(2).
Handling of IO positions for directories had been causing trouble
ever since the directory layouts had grown support for long names.
Originally a directory had been an array of fixed-sized entries; back
then ls(1) simply used read(2). No special logics for handling offsets,
other than "each entry is 16 bytes, so you want to read a multiple of
16 starting from offset that is a multiple of 16".
As soon as FFS had introduced support for names longer than 14 characters,
the things got more complicated - there's no predicting if given position
is an entry boundary. Worse, what used to have been an entry boundary
might very well come to point to the middle of a name - all it takes is
unlink()+creat() done since the time the position used to be OK.
Details are filesystem-dependent; e.g. for original FFS all multiples of
512 are valid offsets, and each entry has its length stored in bytes 4
and 5, so one needs to read the relevant 512 bytes of contents and walk
the list of entries until they get to (or past) the position that needs
to be validated. For ext2 it's a bit more unpleasant, since the chunk
size is not 512 bytes - it's a block size, i.e. might easily by 4Kb.
For more complex layouts it gets even nastier.
Having to do that kind of validation on each directory entry access would
be too costly. That's somewhat mitigated since the old readdir(2) is no
longer used, and we return multiple entries per syscall (getdents(2)).
With the exclusion between directory reads and modifications, that allows
to limit validation to the first entry returned on that syscall.
It's still too costly, though. The next part of mitigation is to use
the fact that valid position will remain valid until somebody modifies a
directory, so we don't need to validate if directory hadn't been changed
since the last time getdents(2) had gotten to this position. Of course,
explicit change of position since that last getdents(2) means that we
can't skip validation.
Another fun part is synthetic filesystems like tmpfs - there we don't
have any persistent directory contents we could use as source of offsets.
All we have is a cyclic list of dentries; memory address of dentry is
obviously not a candidate - trying to validate _that_ would be beyond
unpleasant. So we use the position in the list. To avoid the O(directory
size) cost of walking the list to the position we are asked to read from,
we insert a cursor into the list and have directory reads and seeks move
it as needed. File position is not authoritative there - the cursor is.
They can get out of sync - if you read almost to the end of directory,
unlink the first file, use lseek(fd, SEEK_CUR, 0) to find position, then
lseek to 0 and back to the position reported, you'll end up one entry
further than you would without those lseeks. Userland is generally OK
with that (and we are within the POSIX warranties). However, if that
kind of "out of sync" can happen without any directory modifications
involved, we have trouble.
Directories are not well-suited for random access. Not since mid-80s.
It's possible, but not cheap and there's a lot of non-obvious corner cases
where directory modifications are involved. Some of those must work,
or the userland will break. Telling which ones those are is not trivial.
pgetdents(2) would be a bad idea by itself; making it asynchronous is
the stuff of nightmares. Please, reconsider that ABI - AFAICS, there's
no way to do it cheaply and it will take modifying each ->iterate_dir()
just to avoid breaking the existing uses of those.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2022-01-01 19:59 ` Al Viro
@ 2022-01-03 7:03 ` Jann Horn
2022-01-03 15:00 ` Jens Axboe
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Jann Horn @ 2022-01-03 7:03 UTC (permalink / raw)
To: Al Viro, Jens Axboe, Stefan Roesch
Cc: Linus Torvalds, io-uring, linux-fsdevel, Pavel Begunkov
On Sat, Jan 1, 2022 at 8:59 PM Al Viro <[email protected]> wrote:
> On Fri, Dec 31, 2021 at 11:15:55PM +0000, Al Viro wrote:
> > On Tue, Dec 21, 2021 at 09:15:24AM -0800, Linus Torvalds wrote:
> > > On Tue, Dec 21, 2021 at 8:40 AM Stefan Roesch <[email protected]> wrote:
> > > >
> > > > This series adds support for getdents64 in liburing. The intent is to
> > > > provide a more complete I/O interface for io_uring.
> > >
> > > Ack, this series looks much more natural to me now.
> > >
> > > I think some of the callers of "iterate_dir()" could probably be
> > > cleaned up with the added argument, but for this series I prefer that
> > > mindless approach of just doing "&(arg1)->f_pos" as the third argument
> > > that is clearly a no-op.
> > >
> > > So the end result is perhaps not as beautiful as it could be, but I
> > > think the patch series DTRT.
> >
> > It really does not. Think what happens if you pass e.g. an odd position
> > to that on e.g. ext2/3/4. Or just use it on tmpfs, for that matter.
>
> [A bit of a braindump follows; my apologies for reminding of some
> unpleasant parts of history]
>
> The real problem here is the userland ABI along the lines of pread for
> directories. There's a good reason why we (as well as everybody else,
> AFAIK) do not have pgetdents(2).
>
> Handling of IO positions for directories had been causing trouble
> ever since the directory layouts had grown support for long names.
> Originally a directory had been an array of fixed-sized entries; back
> then ls(1) simply used read(2). No special logics for handling offsets,
> other than "each entry is 16 bytes, so you want to read a multiple of
> 16 starting from offset that is a multiple of 16".
>
> As soon as FFS had introduced support for names longer than 14 characters,
> the things got more complicated - there's no predicting if given position
> is an entry boundary. Worse, what used to have been an entry boundary
> might very well come to point to the middle of a name - all it takes is
> unlink()+creat() done since the time the position used to be OK.
>
> Details are filesystem-dependent; e.g. for original FFS all multiples of
> 512 are valid offsets, and each entry has its length stored in bytes 4
> and 5, so one needs to read the relevant 512 bytes of contents and walk
> the list of entries until they get to (or past) the position that needs
> to be validated. For ext2 it's a bit more unpleasant, since the chunk
> size is not 512 bytes - it's a block size, i.e. might easily by 4Kb.
> For more complex layouts it gets even nastier.
>
> Having to do that kind of validation on each directory entry access would
> be too costly. That's somewhat mitigated since the old readdir(2) is no
> longer used, and we return multiple entries per syscall (getdents(2)).
> With the exclusion between directory reads and modifications, that allows
> to limit validation to the first entry returned on that syscall.
>
> It's still too costly, though. The next part of mitigation is to use
> the fact that valid position will remain valid until somebody modifies a
> directory, so we don't need to validate if directory hadn't been changed
> since the last time getdents(2) had gotten to this position. Of course,
> explicit change of position since that last getdents(2) means that we
> can't skip validation.
And for this validation caching to work properly, AFAIU you need to
hold the file->f_pos_lock (or have exclusive access to the "struct
file"), which happens in the normal getdents() path via fdget_pos().
This guarantees that the readdir handler has exclusive access to the
file's ->f_version, which has to stay in sync with the position.
By the way, this makes me wonder: If I open() an ext4 directory and
then concurrently modify the directory, call readdir() on it, and
issue IORING_OP_READV SQEs with ->off == -1, can that cause ext4 to
think that filesystem corruption has occurred?
io_uring has some dodgy code that seems to be reading and writing
file->f_pos without holding the file->f_pos_lock. And even if the file
doesn't have an f_op->read or f_op->read_iter handler, I think you
might be able to read ->f_pos of an ext4 directory and write the value
back later, unless I'm missing a check somewhere?
io_prep_rw() grabs file->f_pos; then later, io_read() calls
io_iter_do_read() (which will fail with -EINVAL), and then the error
path goes through kiocb_done(), which writes the position back to
req->file->f_pos. So I think the following race might work:
First, open an FD referencing a directory on an ext4 filesystem. Read
part of the directory's entries from the FD with getdents(). Then
modify the directory such that the FD's ->f_version is now stale. Then
do the race:
task A: start a IORING_OP_READV op on the FD and let it grab the stale
offset from file->f_pos
task B: run getdents() on the FD. after this, ->f_version is
up-to-date and ->f_pos points to the start of a dentry
task A: continue handling the IORING_OP_READV: io_iter_do_read()
fails, kiocb_done() overwrites the valid ->f_pos with the stale value
while leaving ->f_version up-to-date
Afterwards, the file might have an ->f_pos pointing in the middle of a
dentry while ->f_version indicates that it is valid (meaning it's
supposed to point to the start of a dentry). If you then do another
getdents() call on the FD, I think you might get garbage back and/or
make the kernel think that the filesystem is corrupted (depending on
what's stored at that offset)?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2022-01-03 7:03 ` Jann Horn
@ 2022-01-03 15:00 ` Jens Axboe
2022-01-03 18:55 ` Linus Torvalds
2022-01-03 21:12 ` Al Viro
2 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2022-01-03 15:00 UTC (permalink / raw)
To: Jann Horn, Al Viro, Stefan Roesch
Cc: Linus Torvalds, io-uring, linux-fsdevel, Pavel Begunkov
On 1/2/22 11:03 PM, Jann Horn wrote:
> io_uring has some dodgy code that seems to be reading and writing
> file->f_pos without holding the file->f_pos_lock. And even if the file
> doesn't have an f_op->read or f_op->read_iter handler, I think you
> might be able to read ->f_pos of an ext4 directory and write the value
> back later, unless I'm missing a check somewhere?
I posted an RFC to hold f_pos_lock across those operations before the
break:
https://lore.kernel.org/io-uring/[email protected]/
picking it up this week and flushing it out, hopefully.
--
Jens Axboe
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2022-01-03 7:03 ` Jann Horn
2022-01-03 15:00 ` Jens Axboe
@ 2022-01-03 18:55 ` Linus Torvalds
2022-01-03 21:12 ` Al Viro
2 siblings, 0 replies; 14+ messages in thread
From: Linus Torvalds @ 2022-01-03 18:55 UTC (permalink / raw)
To: Jann Horn
Cc: Al Viro, Jens Axboe, Stefan Roesch, io-uring, linux-fsdevel,
Pavel Begunkov
On Sun, Jan 2, 2022 at 11:04 PM Jann Horn <[email protected]> wrote:
>
> And for this validation caching to work properly, AFAIU you need to
> hold the file->f_pos_lock (or have exclusive access to the "struct
> file"), which happens in the normal getdents() path via fdget_pos().
> This guarantees that the readdir handler has exclusive access to the
> file's ->f_version, which has to stay in sync with the position.
Yes.
So the whole 'preaddir()' model was wrong, and thanks to Al for noticing.
It turns out that you cannot pass in a different 'pos' than f_pos,
because we have that very tight coupling between the 'struct file' and
readdir().
It's not just about f_pos and f_version, either - Al pointed out the
virtual filesystems, which use a special dentry cursor to traverse the
child dentries for readdir, and that one uses 'file->private_data'.
So the directory position isn't really about some simple passed-in
pos, it has locking rules, it has validation, and it has actual
secondary data in the file pointer.
Linus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2022-01-03 7:03 ` Jann Horn
2022-01-03 15:00 ` Jens Axboe
2022-01-03 18:55 ` Linus Torvalds
@ 2022-01-03 21:12 ` Al Viro
2 siblings, 0 replies; 14+ messages in thread
From: Al Viro @ 2022-01-03 21:12 UTC (permalink / raw)
To: Jann Horn
Cc: Jens Axboe, Stefan Roesch, Linus Torvalds, io-uring,
linux-fsdevel, Pavel Begunkov
On Mon, Jan 03, 2022 at 08:03:51AM +0100, Jann Horn wrote:
> io_prep_rw() grabs file->f_pos; then later, io_read() calls
> io_iter_do_read() (which will fail with -EINVAL), and then the error
> path goes through kiocb_done(), which writes the position back to
> req->file->f_pos. So I think the following race might work:
Why does it touch ->f_pos on failure, anyway? It's a bug, plain and
simple; note that read(2) and write(2) are explicitly requested to
leave IO position alone if they return an error. See e.g.
fs/read_write.c:ksys_read() -
ret = vfs_read(f.file, buf, count, ppos);
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
Position update happens only on success (and only for non-stream
files, at that).
No matter how special io-uring is (it's not covered by POSIX, for
obvious reasons), this is simply wrong, directories or no directories.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
` (3 preceding siblings ...)
2021-12-21 17:15 ` [PATCH v7 0/3] io_uring: add getdents64 support Linus Torvalds
@ 2021-12-21 19:17 ` Jens Axboe
2021-12-31 23:14 ` Al Viro
2023-04-16 22:06 ` Dominique Martinet
6 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2021-12-21 19:17 UTC (permalink / raw)
To: linux-fsdevel, io-uring, Stefan Roesch; +Cc: torvalds
On Tue, 21 Dec 2021 08:40:01 -0800, Stefan Roesch wrote:
> This series adds support for getdents64 in liburing. The intent is to
> provide a more complete I/O interface for io_uring.
>
> Patch 1: fs: add offset parameter to iterate_dir function.
> This adds an offset parameter to the iterate_dir()
> function. The new parameter allows the caller to specify
> the offset to use.
>
> [...]
Applied, thanks!
[1/3] fs: add offset parameter to iterate_dir function
commit: 1533c1b579e1e866465fd9d04c8a5ebb1e25ba28
[2/3] fs: split off vfs_getdents function of getdents64 syscall
commit: 54d460de2423434e7aa9f7fcb9656230de53b85e
[3/3] io_uring: add support for getdents64
commit: b4518682080d3a1cdd6ea45a54ff6772b8b2797a
Best regards,
--
Jens Axboe
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
` (4 preceding siblings ...)
2021-12-21 19:17 ` Jens Axboe
@ 2021-12-31 23:14 ` Al Viro
2023-04-16 22:06 ` Dominique Martinet
6 siblings, 0 replies; 14+ messages in thread
From: Al Viro @ 2021-12-31 23:14 UTC (permalink / raw)
To: Stefan Roesch; +Cc: io-uring, linux-fsdevel, torvalds
On Tue, Dec 21, 2021 at 08:40:01AM -0800, Stefan Roesch wrote:
> This series adds support for getdents64 in liburing. The intent is to
> provide a more complete I/O interface for io_uring.
>
> Patch 1: fs: add offset parameter to iterate_dir function.
> This adds an offset parameter to the iterate_dir()
> function. The new parameter allows the caller to specify
> the offset to use.
>
> Patch 2: fs: split off vfs_getdents function from getdents64 system call
> This splits of the iterate_dir part of the syscall in its own
> dedicated function. This allows to call the function directly from
> io_uring.
>
> Patch 3: io_uring: add support for getdents64
> Adds the functions to io_uring to support getdents64.
>
> There is also a patch series for the changes to liburing. This includes
> a new test. The patch series is called "liburing: add getdents support."
>
> The following tests have been performed:
> - new liburing getdents test program has been run
> - xfstests have been run
> - both tests have been repeated with the kernel memory leak checker
> and no leaks have been reported.
AFAICS, it completely breaks the "is the position known to be valid"
logics in a lot of ->iterate_dir() instances. For a reasonably simple
example see e.g. ext2_readdir():
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
.....
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode_query_iversion(inode);
need_revalidate = false;
}
and that, combined with
* directory modifications bumping iversion
* lseek explicitly cleaing ->f_version on location changes
and resulting position going back into ->f_pos, *BEFORE* we unlock the damn
file.
makes sure that we call ext2_validate_entry() for any untrusted position.
Your code breaks the living hell out of that. First of all, the offset is
completely arbitrary and no amount of struct file-based checks is going to
be relevant. Furthermore, this "we'd normalized the position, the valid
one will go into ->f_pos and do so before the caller does fdput_pos(), so
mark the file as known-good-position" logics also break - ->f_pos is *NOT*
locked and new positition, however valid it might be, is not going to
be put there.
How could that possibly work? I'm not saying that the current variant is
particularly nice, but the need to avoid revalidation of position on each
getdents(2) call is real, and this revalidation is not cheap.
Furthermore, how would that work on e.g. shmem/tmpfs/ramfs/etc.?
There the validation is not an issue, simply because the position is
not used at all - a per-file cursor is, and it's moved by dcache_dir_lseek()
and dcache_readdir().
Folks, readdir from arbitrary position had been a source of pain since
mid-80s. A plenty of PITA all around, and now you introduce another
API that shoves things into the same orifices without even a benefit of
going through lseek(2), or any of the exclusion warranties regarding
the file position?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 0/3] io_uring: add getdents64 support
2021-12-21 16:40 [PATCH v7 0/3] io_uring: add getdents64 support Stefan Roesch
` (5 preceding siblings ...)
2021-12-31 23:14 ` Al Viro
@ 2023-04-16 22:06 ` Dominique Martinet
6 siblings, 0 replies; 14+ messages in thread
From: Dominique Martinet @ 2023-04-16 22:06 UTC (permalink / raw)
To: Stefan Roesch; +Cc: io-uring, linux-fsdevel, torvalds, Jens Axboe, Al Viro
Stefan Roesch wrote on Tue, Dec 21, 2021 at 08:40:01AM -0800:
> This series adds support for getdents64 in liburing. The intent is to
> provide a more complete I/O interface for io_uring.
[reminder: This series was dropped a year ago after Al Viro rightly
pointed out that we can't just pass an arbitrary offset to iterate_dir
as offset validation is costly and not always possible at all]
I'm digging an old topic here because I was looking at trying io_uring
on a toy project (specifically, exporting infos out of /sys/fs/cgroup
recursively), but was partly held back by the lack of getdents or
equivalent interface for the crawler part -- and existing bricks like
fts or nftw predate openat interfaces and just didn't appeal to me, but
in general just mixing in io_uring and synchronous getdents sounded a
bit of a pain.
Given it's been over a year I guess it's not such a much needed feature,
but when you're centering your program loop around the ring having the
occasional getdents/readdir call is a bit cumbersome.
This is probably a naive idea, but would it make sense discussing just
making it fit the current getdents API:
Given the problem seems to be revalidating the offset, since the main
usecase is probably to just go through a directory, perhaps the file's
offset could be updated just like the real call and offset validation be
skipped if the parameter is equal to the file's offset?
Giving a different offset would be equivalent to lseek + getdents and
update the position as well, so e.g. rewinding the directory with a seek
0 would work if an application needs to check a directory's content
multiple times.
Heck, seek to any offset other than 0 could just be refused for any sane
usage, it just doesn't make sense to use anything else in practice;
that'd make it a defacto "dumb getdents + rewinddir".
The API could be made simpler to use/clear about expectations by making
the last parameter "bool rewind_first" instead of an offset (or I guess
a flag with just a single valid bit if we were to really implement this)
This isn't very io_uring-like, but it'd allow for an io_uring-centric
program to also handle some directory processing, and wouldn't expose
anything that's not currently already possible to do (as long as each
processed op brings in its own context, the only "race" I can see in
iterate_dir is that file->f_pos can go back to a previously also valid
position if some iterations are faster than others, assuming it's an
atomic access, and that'd probably warrant a READ/WRITE_ONCE or
something.. but that's not a new problem, user threads can already
hammer getdents in parallel on a single fd if they want useless results)
What do you think?
I'm just asking with a toy in mind so it's not like I have any strong
opinion, but I'd be happy to rework Stefan's patches if it looks like it
might make sense.
(And if the consensus is that this will make hell break loose I'll just
forget about it before sinking more time in it, just catching up was fun
enough!)
Cheers,
--
Dominique Martinet | Asmadeus
^ permalink raw reply [flat|nested] 14+ messages in thread