* [RFC PATCH v2 01/16] block: add check for async buffered writes to generic_write_checks
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 23:43 ` Matthew Wilcox
2022-05-16 16:47 ` [RFC PATCH v2 02/16] iomap: add iomap_page_create_gfp to allocate iomap_pages Stefan Roesch
` (14 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This introduces the flag FMODE_BUF_WASYNC. A file system sets this flag to
signal that it supports async buffered writes. It also modifies the check in
generic_write_checks() to take async buffered writes into consideration.
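For illustration only (this is not part of this patch; a later patch in the
series wires this up for XFS): a file system opts in by setting the new flag
from its ->open handler, the same way FMODE_BUF_RASYNC is set for async
buffered reads. The helper name below is hypothetical:

	static int example_open(struct inode *inode, struct file *file)
	{
		/* Hypothetical sketch: advertise async buffered write support. */
		file->f_mode |= FMODE_BUF_WASYNC;
		return generic_file_open(inode, file);
	}

With the flag set, the generic_write_checks() change in this patch no longer
rejects IOCB_NOWAIT buffered writes with -EINVAL.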
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/read_write.c | 3 ++-
include/linux/fs.h | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index e643aec2b0ef..f75d75f7bc84 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1633,7 +1633,8 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !((iocb->ki_flags & IOCB_DIRECT) || (file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbde95387a23..3b479d02e210 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -177,6 +177,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports async buffered reads */
#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
+/* File supports async nowait buffered writes */
+#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
+
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
--
2.30.2
* Re: [RFC PATCH v2 01/16] block: add check for async buffered writes to generic_write_checks
2022-05-16 16:47 ` [RFC PATCH v2 01/16] block: add check for async buffered writes to generic_write_checks Stefan Roesch
@ 2022-05-16 23:43 ` Matthew Wilcox
0 siblings, 0 replies; 22+ messages in thread
From: Matthew Wilcox @ 2022-05-16 23:43 UTC (permalink / raw)
To: Stefan Roesch
Cc: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel, david,
jack
On Mon, May 16, 2022 at 09:47:03AM -0700, Stefan Roesch wrote:
> - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> + if ((iocb->ki_flags & IOCB_NOWAIT) &&
> + !((iocb->ki_flags & IOCB_DIRECT) || (file->f_mode & FMODE_BUF_WASYNC)))
Please reduce your window width to 80 columns. It's exhausting trying to
read all this with the odd wrapping.
* [RFC PATCH v2 02/16] iomap: add iomap_page_create_gfp to allocate iomap_pages
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 01/16] block: add check for async buffered writes to generic_write_checks Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 03/16] iomap: use iomap_page_create_gfp() in __iomap_write_begin Stefan Roesch
` (13 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
Add the function iomap_page_create_gfp(), which allows the caller to specify
the gfp flags and the number of blocks per folio when allocating an
iomap_page.
No intended functional changes in this patch.
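As a usage sketch (not part of this patch): a caller that must not block can
attach an iomap_page without the GFP_NOFS | __GFP_NOFAIL semantics that
iomap_page_create() keeps, and bail out when the allocation cannot be
satisfied immediately. The gfp choice and the -EAGAIN fallback below are
illustrative only; a later patch in this series adds the real nowait caller:

	struct iomap_page *iop = to_iomap_page(folio);
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);

	if (!iop && nr_blocks > 1) {
		iop = iomap_page_create_gfp(inode, folio, nr_blocks, GFP_NOWAIT);
		if (!iop)
			return -EAGAIN;	/* punt to a context that may block */
	}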
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/iomap/buffered-io.c | 34 ++++++++++++++++++++++++++++------
1 file changed, 28 insertions(+), 6 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8ce8720093b9..85aa32f50db0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -43,17 +43,27 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;
+/**
+ * iomap_page_create_gfp - Create and initialize an iomap_page for a folio
+ * @inode: Pointer to the inode
+ * @folio: Pointer to the folio
+ * @nr_blocks: Number of blocks in the folio
+ * @gfp: gfp allocation flags
+ *
+ * This function returns a newly allocated iomap_page for the folio, allocated
+ * with the gfp flags passed in by the caller.
+ *
+ */
static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio)
+iomap_page_create_gfp(struct inode *inode, struct folio *folio,
+ unsigned int nr_blocks, gfp_t gfp)
{
- struct iomap_page *iop = to_iomap_page(folio);
- unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ struct iomap_page *iop;
- if (iop || nr_blocks <= 1)
+ iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), gfp);
+ if (!iop)
return iop;
- iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- GFP_NOFS | __GFP_NOFAIL);
spin_lock_init(&iop->uptodate_lock);
if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks);
@@ -61,6 +71,18 @@ iomap_page_create(struct inode *inode, struct folio *folio)
return iop;
}
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct folio *folio)
+{
+ struct iomap_page *iop = to_iomap_page(folio);
+ unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+
+ if (iop || nr_blocks <= 1)
+ return iop;
+
+ return iomap_page_create_gfp(inode, folio, nr_blocks, GFP_NOFS | __GFP_NOFAIL);
+}
+
static void iomap_page_release(struct folio *folio)
{
struct iomap_page *iop = folio_detach_private(folio);
--
2.30.2
* [RFC PATCH v2 03/16] iomap: use iomap_page_create_gfp() in __iomap_write_begin
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 01/16] block: add check for async buffered writes to generic_write_checks Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 02/16] iomap: add iomap_page_create_gfp to allocate iomap_pages Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 23:58 ` Matthew Wilcox
2022-05-16 16:47 ` [RFC PATCH v2 04/16] iomap: add async buffered write support Stefan Roesch
` (12 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This change uses the new iomap_page_create_gfp() function in
__iomap_write_begin().
No intended functional changes in this patch.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/iomap/buffered-io.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 85aa32f50db0..1ffdc7078e7d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -572,17 +572,22 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop = iomap_page_create(iter->inode, folio);
+ struct iomap_page *iop = to_iomap_page(folio);
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
+ unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
+ gfp_t gfp = GFP_NOFS | __GFP_NOFAIL;
if (folio_test_uptodate(folio))
return 0;
folio_clear_error(folio);
+ if (!iop && nr_blocks > 1)
+ iop = iomap_page_create_gfp(iter->inode, folio, nr_blocks, gfp);
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
--
2.30.2
* Re: [RFC PATCH v2 03/16] iomap: use iomap_page_create_gfp() in __iomap_write_begin
2022-05-16 16:47 ` [RFC PATCH v2 03/16] iomap: use iomap_page_create_gfp() in __iomap_write_begin Stefan Roesch
@ 2022-05-16 23:58 ` Matthew Wilcox
0 siblings, 0 replies; 22+ messages in thread
From: Matthew Wilcox @ 2022-05-16 23:58 UTC (permalink / raw)
To: Stefan Roesch
Cc: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel, david,
jack
On Mon, May 16, 2022 at 09:47:05AM -0700, Stefan Roesch wrote:
> This change uses the new iomap_page_create_gfp() function in the
> function __iomap_write_begin().
>
> No intended functional changes in this patch.
But there is one. I don't know if it's harmful or not.
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> - struct iomap_page *iop = iomap_page_create(iter->inode, folio);
> + struct iomap_page *iop = to_iomap_page(folio);
> loff_t block_size = i_blocksize(iter->inode);
> loff_t block_start = round_down(pos, block_size);
> loff_t block_end = round_up(pos + len, block_size);
> + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
> size_t from = offset_in_folio(folio, pos), to = from + len;
> size_t poff, plen;
> + gfp_t gfp = GFP_NOFS | __GFP_NOFAIL;
For the case where the folio is already uptodate, would need an iop to
be written back (ie nr_blocks > 1) but doesn't have an iop, we used to
create one here, and now we don't.
How much testing has this seen with blocksize < 4k?
> if (folio_test_uptodate(folio))
> return 0;
> folio_clear_error(folio);
>
> + if (!iop && nr_blocks > 1)
> + iop = iomap_page_create_gfp(iter->inode, folio, nr_blocks, gfp);
> +
> do {
> iomap_adjust_read_range(iter->inode, folio, &block_start,
> block_end - block_start, &poff, &plen);
> --
> 2.30.2
>
>
* [RFC PATCH v2 04/16] iomap: add async buffered write support
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (2 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 03/16] iomap: use iomap_page_create_gfp() in __iomap_write_begin Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-17 11:14 ` Jan Kara
2022-05-16 16:47 ` [RFC PATCH v2 05/16] xfs: add iomap " Stefan Roesch
` (11 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This adds async buffered write support to iomap. The support is focused
on the changes necessary to support XFS with iomap.
Support for other filesystems might require additional changes.
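To make the flow concrete, here is a rough, simplified sketch of how an
IOCB_NOWAIT buffered write traverses the layers after this patch. The
punt_to_io_worker() helper is hypothetical; in reality io_uring performs the
retry from its worker, which is wired up later in the series:

	static ssize_t example_buffered_write_nowait(struct kiocb *iocb,
						     struct iov_iter *from)
	{
		ssize_t ret;

		/* iomap_file_buffered_write() maps IOCB_NOWAIT to IOMAP_NOWAIT. */
		ret = iomap_file_buffered_write(iocb, from,
						&xfs_buffered_write_iomap_ops);

		/*
		 * __iomap_write_begin() now returns -EAGAIN instead of
		 * sleeping when an iomap_page allocation would block or a
		 * read-modify-write of a !uptodate folio would be required.
		 */
		if (ret == -EAGAIN)
			ret = punt_to_io_worker(iocb, from);	/* hypothetical */
		return ret;
	}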
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/iomap/buffered-io.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1ffdc7078e7d..ceb3091f94c2 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -580,13 +580,20 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
gfp_t gfp = GFP_NOFS | __GFP_NOFAIL;
+ bool no_wait = (iter->flags & IOMAP_NOWAIT);
+
+ if (no_wait)
+ gfp = GFP_NOIO;
if (folio_test_uptodate(folio))
return 0;
folio_clear_error(folio);
- if (!iop && nr_blocks > 1)
+ if (!iop && nr_blocks > 1) {
iop = iomap_page_create_gfp(iter->inode, folio, nr_blocks, gfp);
+ if (no_wait && !iop)
+ return -EAGAIN;
+ }
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
@@ -603,6 +610,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen);
+ } else if (no_wait) {
+ return -EAGAIN;
} else {
int status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
@@ -633,6 +642,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -790,6 +802,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
+ *
+ * For async buffered writes the assumption is that the user
+ * page has already been faulted in. This can be optimized by
+ * faulting the user page in the prepare phase of io-uring.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
@@ -845,6 +861,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
};
int ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ iter.flags |= IOMAP_NOWAIT;
+
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos)
--
2.30.2
* Re: [RFC PATCH v2 04/16] iomap: add async buffered write support
2022-05-16 16:47 ` [RFC PATCH v2 04/16] iomap: add async buffered write support Stefan Roesch
@ 2022-05-17 11:14 ` Jan Kara
0 siblings, 0 replies; 22+ messages in thread
From: Jan Kara @ 2022-05-17 11:14 UTC (permalink / raw)
To: Stefan Roesch
Cc: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel, david,
jack
On Mon 16-05-22 09:47:06, Stefan Roesch wrote:
> This adds async buffered write support to iomap. The support is focused
> on the changes necessary to support XFS with iomap.
>
> Support for other filesystems might require additional changes.
>
> Signed-off-by: Stefan Roesch <[email protected]>
> ---
> fs/iomap/buffered-io.c | 21 ++++++++++++++++++++-
> 1 file changed, 20 insertions(+), 1 deletion(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 1ffdc7078e7d..ceb3091f94c2 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -580,13 +580,20 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> size_t from = offset_in_folio(folio, pos), to = from + len;
> size_t poff, plen;
> gfp_t gfp = GFP_NOFS | __GFP_NOFAIL;
> + bool no_wait = (iter->flags & IOMAP_NOWAIT);
> +
> + if (no_wait)
> + gfp = GFP_NOIO;
GFP_NOIO means that direct reclaim is still allowed. Not sure whether you
want to enter direct reclaim from io_uring fast path because in theory that
can still sleep. GFP_NOWAIT would be a more natural choice...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
* [RFC PATCH v2 05/16] xfs: add iomap async buffered write support
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (3 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 04/16] iomap: add async buffered write support Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 06/16] fs: split off need_remove_file_privs() do_remove_file_privs() Stefan Roesch
` (10 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This adds async buffered write support to the iomap layer of XFS. If a lock
cannot be acquired or additional reads need to be performed, the request
returns -EAGAIN if it is an async buffered write request.

This patch changes the helper function xfs_ilock_for_iomap such that the
lock mode is passed in by the caller.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/xfs/xfs_iomap.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e552ce541ec2..1aea962262ad 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -659,7 +659,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
- unsigned mode = XFS_ILOCK_SHARED;
+ unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@@ -737,7 +737,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -881,18 +881,22 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp))
return -EIO;
/* we can't use delayed allocations when using extent size hints */
- if (xfs_get_extsz_hint(ip))
+ if (xfs_get_extsz_hint(ip)) {
return xfs_direct_write_iomap_begin(inode, offset, count,
flags, iomap, srcmap);
+ }
ASSERT(!XFS_IS_REALTIME_INODE(ip));
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@@ -1167,7 +1171,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
--
2.30.2
* [RFC PATCH v2 06/16] fs: split off need_remove_file_privs() do_remove_file_privs()
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (4 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 05/16] xfs: add iomap " Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 07/16] fs: split off need_file_update_time and do_file_update_time Stefan Roesch
` (9 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This splits the functions need_file_remove_privs() and
do_file_remove_privs() out of the function file_remove_privs().
No intended functional changes in this patch.
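The motivation for the split, sketched below: the check half is cheap and can
run in a nowait context, while the action half may block and has to be left
to a context that is allowed to sleep. This is only a sketch (the helpers are
static to fs/inode.c and the nowait flag is hypothetical); the real nowait
caller is added later in the series via file_modified_async():

	static int example_remove_privs_nowait(struct file *file, bool nowait)
	{
		struct dentry *dentry = file_dentry(file);
		struct inode *inode = file_inode(file);
		int kill = need_file_remove_privs(inode, dentry);

		if (kill <= 0)
			return kill;	/* nothing to strip, or an error */
		if (nowait)
			return -EAGAIN;	/* stripping suid/caps may block; punt */
		return do_file_remove_privs(file, inode, dentry, kill);
	}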
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/inode.c | 57 ++++++++++++++++++++++++++++++++++++------------------
1 file changed, 38 insertions(+), 19 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 9d9b422504d1..a6d70a1983f8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2010,17 +2010,8 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
-/*
- * Remove special file priviledges (suid, capabilities) when file is written
- * to or truncated.
- */
-int file_remove_privs(struct file *file)
+static int need_file_remove_privs(struct inode *inode, struct dentry *dentry)
{
- struct dentry *dentry = file_dentry(file);
- struct inode *inode = file_inode(file);
- int kill;
- int error = 0;
-
/*
* Fast path for nothing security related.
* As well for non-regular files, e.g. blkdev inodes.
@@ -2030,16 +2021,37 @@ int file_remove_privs(struct file *file)
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
- kill = dentry_needs_remove_privs(dentry);
- if (kill < 0)
- return kill;
- if (kill)
- error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+ return dentry_needs_remove_privs(dentry);
+}
+
+static int do_file_remove_privs(struct file *file, struct inode *inode,
+ struct dentry *dentry, int kill)
+{
+ int error = 0;
+
+ error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
return error;
}
+
+/*
+ * Remove special file privileges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+int file_remove_privs(struct file *file)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = file_inode(file);
+ int kill;
+
+ kill = need_file_remove_privs(inode, dentry);
+ if (kill <= 0)
+ return kill;
+
+ return do_file_remove_privs(file, inode, dentry, kill);
+}
EXPORT_SYMBOL(file_remove_privs);
/**
@@ -2093,15 +2105,22 @@ EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
- int err;
+ int ret;
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = file_inode(file);
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- err = file_remove_privs(file);
- if (err)
- return err;
+ ret = need_file_remove_privs(inode, dentry);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ ret = do_file_remove_privs(file, inode, dentry, ret);
+ if (ret)
+ return ret;
+ }
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
--
2.30.2
* [RFC PATCH v2 07/16] fs: split off need_file_update_time and do_file_update_time
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (5 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 06/16] fs: split off need_remove_file_privs() do_remove_file_privs() Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-17 11:20 ` Jan Kara
2022-05-16 16:47 ` [RFC PATCH v2 08/16] fs: add pending file update time flag Stefan Roesch
` (8 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This splits off the functions need_file_update_time() and
do_file_update_time() from the function file_update_time().
This is required to support async buffered writes.
No intended functional changes in this patch.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/inode.c | 71 ++++++++++++++++++++++++++++++++++++------------------
1 file changed, 47 insertions(+), 24 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index a6d70a1983f8..1d0b02763e98 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2054,35 +2054,22 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
-/**
- * file_update_time - update mtime and ctime time
- * @file: file accessed
- *
- * Update the mtime and ctime members of an inode and mark the inode
- * for writeback. Note that this function is meant exclusively for
- * usage in the file write path of filesystems, and filesystems may
- * choose to explicitly ignore update via this function with the
- * S_NOCMTIME inode flag, e.g. for network filesystem where these
- * timestamps are handled by the server. This can return an error for
- * file systems who need to allocate space in order to update an inode.
- */
-
-int file_update_time(struct file *file)
+static int need_file_update_time(struct inode *inode, struct file *file,
+ struct timespec64 *now)
{
- struct inode *inode = file_inode(file);
- struct timespec64 now;
int sync_it = 0;
- int ret;
+
+ if (unlikely(file->f_mode & FMODE_NOCMTIME))
+ return 0;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
+ if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, &now))
+ if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2091,15 +2078,49 @@ int file_update_time(struct file *file)
if (!sync_it)
return 0;
+ return sync_it;
+}
+
+static int do_file_update_time(struct inode *inode, struct file *file,
+ struct timespec64 *now, int sync_mode)
+{
+ int ret;
+
/* Finally allowed to write? Takes lock. */
if (__mnt_want_write_file(file))
return 0;
- ret = inode_update_time(inode, &now, sync_it);
+ ret = inode_update_time(inode, now, sync_mode);
__mnt_drop_write_file(file);
return ret;
}
+
+/**
+ * file_update_time - update mtime and ctime time
+ * @file: file accessed
+ *
+ * Update the mtime and ctime members of an inode and mark the inode
+ * for writeback. Note that this function is meant exclusively for
+ * usage in the file write path of filesystems, and filesystems may
+ * choose to explicitly ignore update via this function with the
+ * S_NOCMTIME inode flag, e.g. for network filesystem where these
+ * timestamps are handled by the server. This can return an error for
+ * file systems who need to allocate space in order to update an inode.
+ */
+
+int file_update_time(struct file *file)
+{
+ int err;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
+
+ err = need_file_update_time(inode, file, &now);
+ if (err < 0)
+ return err;
+
+ return do_file_update_time(inode, file, &now, err);
+}
EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
@@ -2108,6 +2129,7 @@ int file_modified(struct file *file)
int ret;
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
@@ -2122,10 +2144,11 @@ int file_modified(struct file *file)
return ret;
}
- if (unlikely(file->f_mode & FMODE_NOCMTIME))
- return 0;
+ ret = need_file_update_time(inode, file, &now);
+ if (ret <= 0)
+ return ret;
- return file_update_time(file);
+ return do_file_update_time(inode, file, &now, ret);
}
EXPORT_SYMBOL(file_modified);
--
2.30.2
* Re: [RFC PATCH v2 07/16] fs: split off need_file_update_time and do_file_update_time
2022-05-16 16:47 ` [RFC PATCH v2 07/16] fs: split off need_file_update_time and do_file_update_time Stefan Roesch
@ 2022-05-17 11:20 ` Jan Kara
0 siblings, 0 replies; 22+ messages in thread
From: Jan Kara @ 2022-05-17 11:20 UTC (permalink / raw)
To: Stefan Roesch
Cc: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel, david,
jack
On Mon 16-05-22 09:47:09, Stefan Roesch wrote:
> This splits off the functions need_file_update_time() and
> do_file_update_time() from the function file_update_time().
>
> This is required to support async buffered writes.
> No intended functional changes in this patch.
>
> Signed-off-by: Stefan Roesch <[email protected]>
...
> +int file_update_time(struct file *file)
> +{
> + int err;
> + struct inode *inode = file_inode(file);
> + struct timespec64 now = current_time(inode);
> +
> + err = need_file_update_time(inode, file, &now);
> + if (err < 0)
> + return err;
> +
> + return do_file_update_time(inode, file, &now, err);
> +}
> EXPORT_SYMBOL(file_update_time);
I guess 'ret' would be a more appropriate variable name than 'err'.
Otherwise the patch looks good.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
* [RFC PATCH v2 08/16] fs: add pending file update time flag.
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (6 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 07/16] fs: split off need_file_update_time and do_file_update_time Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-17 11:28 ` Jan Kara
2022-05-16 16:47 ` [RFC PATCH v2 09/16] xfs: enable async write file modification handling Stefan Roesch
` (7 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This introduces an optimization for the update time flag and async
buffered writes. While an update of the file modification time is
pending and is handled by the workers, concurrent writes do not need
to wait for this time update to complete.
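The intended use (wired up in the next patch) is roughly the sketch below;
the helper is a hypothetical simplification, and, as the review below points
out, what serializes the i_flags update is an open question:

	static int example_timestamp_update_nowait(struct file *file)
	{
		struct inode *inode = file_inode(file);

		if (IS_PENDING_TIME(inode))
			return 0;	/* an io worker already owns the update */

		inode->i_flags |= S_PENDING_TIME;
		return -EAGAIN;		/* punt; the worker clears the flag
					 * after calling inode_update_time() */
	}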
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/inode.c | 1 +
include/linux/fs.h | 3 +++
2 files changed, 4 insertions(+)
diff --git a/fs/inode.c b/fs/inode.c
index 1d0b02763e98..fd18b2c1b7c4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2091,6 +2091,7 @@ static int do_file_update_time(struct inode *inode, struct file *file,
return 0;
ret = inode_update_time(inode, now, sync_mode);
+ inode->i_flags &= ~S_PENDING_TIME;
__mnt_drop_write_file(file);
return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3b479d02e210..3da641dfa6d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2141,6 +2141,8 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_PENDING_TIME (1 << 18) /* File update time is pending */
+
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2183,6 +2185,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY)
+#define IS_PENDING_TIME(inode) ((inode)->i_flags & S_PENDING_TIME)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
--
2.30.2
* Re: [RFC PATCH v2 08/16] fs: add pending file update time flag.
2022-05-16 16:47 ` [RFC PATCH v2 08/16] fs: add pending file update time flag Stefan Roesch
@ 2022-05-17 11:28 ` Jan Kara
0 siblings, 0 replies; 22+ messages in thread
From: Jan Kara @ 2022-05-17 11:28 UTC (permalink / raw)
To: Stefan Roesch
Cc: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel, david,
jack
On Mon 16-05-22 09:47:10, Stefan Roesch wrote:
> This introduces an optimization for the update time flag and async
> buffered writes. While an update of the file modification time is
> pending and is handled by the workers, concurrent writes do not need
> to wait for this time update to complete.
>
> Signed-off-by: Stefan Roesch <[email protected]>
> ---
> fs/inode.c | 1 +
> include/linux/fs.h | 3 +++
> 2 files changed, 4 insertions(+)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 1d0b02763e98..fd18b2c1b7c4 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2091,6 +2091,7 @@ static int do_file_update_time(struct inode *inode, struct file *file,
> return 0;
>
> ret = inode_update_time(inode, now, sync_mode);
> + inode->i_flags &= ~S_PENDING_TIME;
So what protects this update of inode->i_flags? Usually we use
inode->i_rwsem for that but not all file_update_time() callers hold it...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
* [RFC PATCH v2 09/16] xfs: enable async write file modification handling.
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (7 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 08/16] fs: add pending file update time flag Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 10/16] xfs: add async buffered write support Stefan Roesch
` (6 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This modifies the xfs write checks to return -EAGAIN if the request either
needs to remove privileges or needs to update the file modification time.
This is required for async buffered writes, so the request gets handled in
the io worker.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/inode.c | 19 ++++++++++++++++++-
fs/xfs/xfs_file.c | 2 +-
include/linux/fs.h | 1 +
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index fd18b2c1b7c4..40941feaec8d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2126,6 +2126,13 @@ EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
+{
+ return file_modified_async(file, 0);
+}
+EXPORT_SYMBOL(file_modified);
+
+/* Caller must hold the file's inode lock */
+int file_modified_async(struct file *file, int flags)
{
int ret;
struct dentry *dentry = file_dentry(file);
@@ -2140,6 +2147,9 @@ int file_modified(struct file *file)
if (ret < 0) {
return ret;
} else if (ret > 0) {
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
ret = do_file_remove_privs(file, inode, dentry, ret);
if (ret)
return ret;
@@ -2148,10 +2158,17 @@ int file_modified(struct file *file)
ret = need_file_update_time(inode, file, &now);
if (ret <= 0)
return ret;
+ if (flags & IOCB_NOWAIT) {
+ if (IS_PENDING_TIME(inode))
+ return 0;
+
+ inode->i_flags |= S_PENDING_TIME;
+ return -EAGAIN;
+ }
return do_file_update_time(inode, file, &now, ret);
}
-EXPORT_SYMBOL(file_modified);
+EXPORT_SYMBOL(file_modified_async);
int inode_needs_sync(struct inode *inode)
{
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5bddb1e9e0b3..793918c83755 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -410,7 +410,7 @@ xfs_file_write_checks(
spin_unlock(&ip->i_flags_lock);
out:
- return file_modified(file);
+ return file_modified_async(file, iocb->ki_flags);
}
static int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3da641dfa6d9..5f3aaf61fb4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2385,6 +2385,7 @@ static inline void file_accessed(struct file *file)
}
extern int file_modified(struct file *file);
+extern int file_modified_async(struct file *file, int flags);
int sync_inode_metadata(struct inode *inode, int wait);
--
2.30.2
* [RFC PATCH v2 10/16] xfs: add async buffered write support
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (8 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 09/16] xfs: enable async write file modification handling Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 11/16] io_uring: add support for async buffered writes Stefan Roesch
` (5 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This adds async buffered write support to XFS. For async buffered write
requests, the request returns -EAGAIN if the ilock cannot be obtained
immediately.

This turns the existing helper xfs_ilock_iocb into xfs_ilock_xfs_inode,
which takes the xfs_inode and the iocb flags directly, so the buffered
write path can use it. The existing helper could not be reused as it
hardcoded which inode gets locked.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/xfs/xfs_file.c | 32 +++++++++++++++-----------------
1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 793918c83755..ad3175b7d366 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -190,14 +190,13 @@ xfs_file_fsync(
return error;
}
-static int
-xfs_ilock_iocb(
- struct kiocb *iocb,
+static inline int
+xfs_ilock_xfs_inode(
+ struct xfs_inode *ip,
+ int flags,
unsigned int lock_mode)
{
- struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (flags & IOCB_NOWAIT) {
if (!xfs_ilock_nowait(ip, lock_mode))
return -EAGAIN;
} else {
@@ -222,7 +221,7 @@ xfs_file_dio_read(
file_accessed(iocb->ki_filp);
- ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, XFS_IOLOCK_SHARED);
if (ret)
return ret;
ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
@@ -244,7 +243,7 @@ xfs_file_dax_read(
if (!iov_iter_count(to))
return 0; /* skip atime */
- ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, XFS_IOLOCK_SHARED);
if (ret)
return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
@@ -264,7 +263,7 @@ xfs_file_buffered_read(
trace_xfs_file_buffered_read(iocb, to);
- ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, XFS_IOLOCK_SHARED);
if (ret)
return ret;
ret = generic_file_read_iter(iocb, to);
@@ -343,7 +342,7 @@ xfs_file_write_checks(
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- error = xfs_ilock_iocb(iocb, *iolock);
+ error = xfs_ilock_xfs_inode(ip, iocb->ki_flags, *iolock);
if (error) {
*iolock = 0;
return error;
@@ -516,7 +515,7 @@ xfs_file_dio_write_aligned(
int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -583,7 +582,7 @@ xfs_file_dio_write_unaligned(
flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, iolock);
if (ret)
return ret;
@@ -659,7 +658,7 @@ xfs_file_dax_write(
ssize_t ret, error = 0;
loff_t pos;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -702,12 +701,11 @@ xfs_file_buffered_write(
bool cleared_space = false;
int iolock;
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
-
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_xfs_inode(ip, iocb->ki_flags, iolock);
+ if (ret)
+ return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
--
2.30.2
* [RFC PATCH v2 11/16] io_uring: add support for async buffered writes
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (9 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 10/16] xfs: add async buffered write support Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 12/16] mm: factor out _balance_dirty_pages() from balance_dirty_pages() Stefan Roesch
` (4 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This enables async buffered writes in io-uring for the filesystems that
support them. Buffered writes are completed inline when the blocks are
already in the page cache or can be acquired with a noio allocation.
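Nothing changes in the user-space API; what changes is that a plain buffered
write submitted through io_uring can now complete inline (on file systems
that set FMODE_BUF_WASYNC) instead of always being punted to an io-wq worker.
A minimal user-space example, assuming a liburing version that provides
io_uring_prep_write(); the file name is arbitrary:

	#include <fcntl.h>
	#include <liburing.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[4096];
		int fd;

		memset(buf, 'a', sizeof(buf));
		fd = open("testfile", O_WRONLY | O_CREAT, 0644);	/* buffered, no O_DIRECT */
		if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
		io_uring_submit(&ring);

		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("write returned %d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}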
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/io_uring.c | 29 ++++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91de361ea9ab..f3aaac286509 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3746,7 +3746,7 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
return -EINVAL;
}
-static bool need_read_all(struct io_kiocb *req)
+static bool need_complete_io(struct io_kiocb *req)
{
return req->flags & REQ_F_ISREG ||
S_ISBLK(file_inode(req->file)->i_mode);
@@ -3875,7 +3875,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
} else if (ret == -EIOCBQUEUED) {
goto out_free;
} else if (ret == req->result || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
+ (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@@ -3971,9 +3971,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!io_file_supports_nowait(req)))
goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
+ /* File path supports NOWAIT for non-direct_IO only for block devices. */
+ if (!(kiocb->ki_flags & IOCB_DIRECT) &&
+ !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
+ (req->flags & REQ_F_ISREG))
goto copy_iov;
kiocb->ki_flags |= IOCB_NOWAIT;
@@ -4027,6 +4028,24 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* IOPOLL retry should happen for io-wq threads */
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
+
+ if (ret2 != req->result && ret2 >= 0 && need_complete_io(req)) {
+ struct io_async_rw *rw;
+
+ /* This is a partial write. The file pos has already been
+ * updated, setup the async struct to complete the request
+ * in the worker. Also update bytes_done to account for
+ * the bytes already written.
+ */
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, true);
+
+ rw = req->async_data;
+ if (rw)
+ rw->bytes_done += ret2;
+
+ return ret ? ret : -EAGAIN;
+ }
done:
kiocb_done(req, ret2, issue_flags);
} else {
--
2.30.2
* [RFC PATCH v2 12/16] mm: factor out _balance_dirty_pages() from balance_dirty_pages()
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (10 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 11/16] io_uring: add support for async buffered writes Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 13/16] mm: add balance_dirty_pages_ratelimited_flags() function Stefan Roesch
` (3 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This factors the body of the main loop in balance_dirty_pages() out into a
new function called _balance_dirty_pages(). By factoring out this function
the async write code can determine whether it has to throttle writes or not.
The function _balance_dirty_pages() returns the sleep time. If the sleep
time is greater than 0, then the async write code needs to throttle.

To maintain the context for consecutive calls of _balance_dirty_pages() and
preserve the current behavior, a new data structure called bdp_ctx has been
introduced.
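The reason for returning the sleep time becomes visible in the next patch; a
rough sketch of the intended async-side use (simplified, the surrounding
function is hypothetical):

	/* Probe the throttling decision instead of sleeping on it. */
	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };

	if (_balance_dirty_pages(wb, current->nr_dirtied, &ctx) > 0)
		return -EAGAIN;	/* would have to sleep: punt to the io worker,
				 * which calls the blocking path and sleeps there */
	return 0;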
Signed-off-by: Stefan Roesch <[email protected]>
---
mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
1 file changed, 239 insertions(+), 213 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7e2da284e427..cbb74c0666c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -140,6 +140,29 @@ struct dirty_throttle_control {
unsigned long pos_ratio;
};
+/* context for consecutive calls to _balance_dirty_pages() */
+struct bdp_ctx {
+ long pause;
+ unsigned long now;
+ unsigned long start_time;
+ unsigned long task_ratelimit;
+ unsigned long dirty_ratelimit;
+ unsigned long nr_reclaimable;
+ int nr_dirtied_pause;
+ bool dirty_exceeded;
+
+ struct dirty_throttle_control gdtc_stor;
+ struct dirty_throttle_control mdtc_stor;
+ struct dirty_throttle_control *sdtc;
+};
+
+/* initialize _balance_dirty_pages() context */
+#define BDP_CTX_INIT(ctx, wb) \
+ .gdtc_stor = { GDTC_INIT(wb) }, \
+ .mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) }, \
+ .start_time = jiffies, \
+ .dirty_exceeded = false
+
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
}
}
-/*
- * balance_dirty_pages() must be called by processes which are generating dirty
- * data. It looks at the number of dirty pages in the machine and will force
- * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
- * If we're over `background_thresh' then the writeback threads are woken to
- * perform some writeout.
- */
-static void balance_dirty_pages(struct bdi_writeback *wb,
- unsigned long pages_dirtied)
+static inline int _balance_dirty_pages(struct bdi_writeback *wb,
+ unsigned long pages_dirtied, struct bdp_ctx *ctx)
{
- struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
- struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
- struct dirty_throttle_control * const gdtc = &gdtc_stor;
- struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
- &mdtc_stor : NULL;
- struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty */
+ struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
+ &ctx->mdtc_stor : NULL;
long period;
- long pause;
long max_pause;
long min_pause;
- int nr_dirtied_pause;
- bool dirty_exceeded = false;
- unsigned long task_ratelimit;
- unsigned long dirty_ratelimit;
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
- unsigned long start_time = jiffies;
- for (;;) {
- unsigned long now = jiffies;
- unsigned long dirty, thresh, bg_thresh;
- unsigned long m_dirty = 0; /* stop bogus uninit warnings */
- unsigned long m_thresh = 0;
- unsigned long m_bg_thresh = 0;
-
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
- gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+ unsigned long dirty, thresh, bg_thresh;
+ unsigned long m_dirty = 0; /* stop bogus uninit warnings */
+ unsigned long m_thresh = 0;
+ unsigned long m_bg_thresh = 0;
- domain_dirty_limits(gdtc);
+ ctx->now = jiffies;
+ ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
- if (unlikely(strictlimit)) {
- wb_dirty_limits(gdtc);
+ domain_dirty_limits(gdtc);
- dirty = gdtc->wb_dirty;
- thresh = gdtc->wb_thresh;
- bg_thresh = gdtc->wb_bg_thresh;
- } else {
- dirty = gdtc->dirty;
- thresh = gdtc->thresh;
- bg_thresh = gdtc->bg_thresh;
- }
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(gdtc);
- if (mdtc) {
- unsigned long filepages, headroom, writeback;
+ dirty = gdtc->wb_dirty;
+ thresh = gdtc->wb_thresh;
+ bg_thresh = gdtc->wb_bg_thresh;
+ } else {
+ dirty = gdtc->dirty;
+ thresh = gdtc->thresh;
+ bg_thresh = gdtc->bg_thresh;
+ }
- /*
- * If @wb belongs to !root memcg, repeat the same
- * basic calculations for the memcg domain.
- */
- mem_cgroup_wb_stats(wb, &filepages, &headroom,
- &mdtc->dirty, &writeback);
- mdtc->dirty += writeback;
- mdtc_calc_avail(mdtc, filepages, headroom);
-
- domain_dirty_limits(mdtc);
-
- if (unlikely(strictlimit)) {
- wb_dirty_limits(mdtc);
- m_dirty = mdtc->wb_dirty;
- m_thresh = mdtc->wb_thresh;
- m_bg_thresh = mdtc->wb_bg_thresh;
- } else {
- m_dirty = mdtc->dirty;
- m_thresh = mdtc->thresh;
- m_bg_thresh = mdtc->bg_thresh;
- }
- }
+ if (mdtc) {
+ unsigned long filepages, headroom, writeback;
/*
- * Throttle it only when the background writeback cannot
- * catch-up. This avoids (excessively) small writeouts
- * when the wb limits are ramping up in case of !strictlimit.
- *
- * In strictlimit case make decision based on the wb counters
- * and limits. Small writeouts when the wb limits are ramping
- * up are the price we consciously pay for strictlimit-ing.
- *
- * If memcg domain is in effect, @dirty should be under
- * both global and memcg freerun ceilings.
+ * If @wb belongs to !root memcg, repeat the same
+ * basic calculations for the memcg domain.
*/
- if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
- (!mdtc ||
- m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
- unsigned long intv;
- unsigned long m_intv;
+ mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc->dirty, &writeback);
+ mdtc->dirty += writeback;
+ mdtc_calc_avail(mdtc, filepages, headroom);
-free_running:
- intv = dirty_poll_interval(dirty, thresh);
- m_intv = ULONG_MAX;
+ domain_dirty_limits(mdtc);
- current->dirty_paused_when = now;
- current->nr_dirtied = 0;
- if (mdtc)
- m_intv = dirty_poll_interval(m_dirty, m_thresh);
- current->nr_dirtied_pause = min(intv, m_intv);
- break;
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(mdtc);
+ m_dirty = mdtc->wb_dirty;
+ m_thresh = mdtc->wb_thresh;
+ m_bg_thresh = mdtc->wb_bg_thresh;
+ } else {
+ m_dirty = mdtc->dirty;
+ m_thresh = mdtc->thresh;
+ m_bg_thresh = mdtc->bg_thresh;
}
+ }
- if (unlikely(!writeback_in_progress(wb)))
- wb_start_background_writeback(wb);
+ /*
+ * Throttle it only when the background writeback cannot
+ * catch-up. This avoids (excessively) small writeouts
+ * when the wb limits are ramping up in case of !strictlimit.
+ *
+ * In strictlimit case make decision based on the wb counters
+ * and limits. Small writeouts when the wb limits are ramping
+ * up are the price we consciously pay for strictlimit-ing.
+ *
+ * If memcg domain is in effect, @dirty should be under
+ * both global and memcg freerun ceilings.
+ */
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+ (!mdtc ||
+ m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+ unsigned long intv;
+ unsigned long m_intv;
- mem_cgroup_flush_foreign(wb);
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
+
+ current->dirty_paused_when = ctx->now;
+ current->nr_dirtied = 0;
+ if (mdtc)
+ m_intv = dirty_poll_interval(m_dirty, m_thresh);
+ current->nr_dirtied_pause = min(intv, m_intv);
+ return 0;
+ }
+
+ if (unlikely(!writeback_in_progress(wb)))
+ wb_start_background_writeback(wb);
+ mem_cgroup_flush_foreign(wb);
+
+ /*
+ * Calculate global domain's pos_ratio and select the
+ * global dtc by default.
+ */
+ if (!strictlimit) {
+ wb_dirty_limits(gdtc);
+
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
+
+ ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+ ((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+ wb_position_ratio(gdtc);
+ ctx->sdtc = gdtc;
+
+ if (mdtc) {
/*
- * Calculate global domain's pos_ratio and select the
- * global dtc by default.
+ * If memcg domain is in effect, calculate its
+ * pos_ratio. @wb should satisfy constraints from
+ * both global and memcg domains. Choose the one
+ * w/ lower pos_ratio.
*/
if (!strictlimit) {
- wb_dirty_limits(gdtc);
+ wb_dirty_limits(mdtc);
if ((current->flags & PF_LOCAL_THROTTLE) &&
- gdtc->wb_dirty <
- dirty_freerun_ceiling(gdtc->wb_thresh,
- gdtc->wb_bg_thresh))
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
/*
- * LOCAL_THROTTLE tasks must not be throttled
- * when below the per-wb freerun ceiling.
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
*/
goto free_running;
}
+ ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+ ((mdtc->dirty > mdtc->thresh) || strictlimit);
- dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
- ((gdtc->dirty > gdtc->thresh) || strictlimit);
+ wb_position_ratio(mdtc);
+ if (mdtc->pos_ratio < gdtc->pos_ratio)
+ ctx->sdtc = mdtc;
+ }
- wb_position_ratio(gdtc);
- sdtc = gdtc;
+ if (ctx->dirty_exceeded && !wb->dirty_exceeded)
+ wb->dirty_exceeded = 1;
- if (mdtc) {
- /*
- * If memcg domain is in effect, calculate its
- * pos_ratio. @wb should satisfy constraints from
- * both global and memcg domains. Choose the one
- * w/ lower pos_ratio.
- */
- if (!strictlimit) {
- wb_dirty_limits(mdtc);
-
- if ((current->flags & PF_LOCAL_THROTTLE) &&
- mdtc->wb_dirty <
- dirty_freerun_ceiling(mdtc->wb_thresh,
- mdtc->wb_bg_thresh))
- /*
- * LOCAL_THROTTLE tasks must not be
- * throttled when below the per-wb
- * freerun ceiling.
- */
- goto free_running;
- }
- dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
- ((mdtc->dirty > mdtc->thresh) || strictlimit);
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ __wb_update_bandwidth(gdtc, mdtc, true);
+
+ /* throttle according to the chosen dtc */
+ ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
+ ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
+ RATELIMIT_CALC_SHIFT;
+ max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
+ min_pause = wb_min_pause(wb, max_pause,
+ ctx->task_ratelimit, ctx->dirty_ratelimit,
+ &ctx->nr_dirtied_pause);
+
+ if (unlikely(ctx->task_ratelimit == 0)) {
+ period = max_pause;
+ ctx->pause = max_pause;
+ goto pause;
+ }
+ period = HZ * pages_dirtied / ctx->task_ratelimit;
+ ctx->pause = period;
+ if (current->dirty_paused_when)
+ ctx->pause -= ctx->now - current->dirty_paused_when;
+ /*
+ * For less than 1s think time (ext3/4 may block the dirtier
+ * for up to 800ms from time to time on 1-HDD; so does xfs,
+ * however at much less frequency), try to compensate it in
+ * future periods by updating the virtual time; otherwise just
+ * do a reset, as it may be a light dirtier.
+ */
+ if (ctx->pause < min_pause) {
+ trace_balance_dirty_pages(wb,
+ ctx->sdtc->thresh,
+ ctx->sdtc->bg_thresh,
+ ctx->sdtc->dirty,
+ ctx->sdtc->wb_thresh,
+ ctx->sdtc->wb_dirty,
+ ctx->dirty_ratelimit,
+ ctx->task_ratelimit,
+ pages_dirtied,
+ period,
+ min(ctx->pause, 0L),
+ ctx->start_time);
+ if (ctx->pause < -HZ) {
+ current->dirty_paused_when = ctx->now;
+ current->nr_dirtied = 0;
+ } else if (period) {
+ current->dirty_paused_when += period;
+ current->nr_dirtied = 0;
+ } else if (current->nr_dirtied_pause <= pages_dirtied)
+ current->nr_dirtied_pause += pages_dirtied;
+ return 0;
+ }
+ if (unlikely(ctx->pause > max_pause)) {
+ /* for occasional dropped task_ratelimit */
+ ctx->now += min(ctx->pause - max_pause, max_pause);
+ ctx->pause = max_pause;
+ }
- wb_position_ratio(mdtc);
- if (mdtc->pos_ratio < gdtc->pos_ratio)
- sdtc = mdtc;
- }
+pause:
+ trace_balance_dirty_pages(wb,
+ ctx->sdtc->thresh,
+ ctx->sdtc->bg_thresh,
+ ctx->sdtc->dirty,
+ ctx->sdtc->wb_thresh,
+ ctx->sdtc->wb_dirty,
+ ctx->dirty_ratelimit,
+ ctx->task_ratelimit,
+ pages_dirtied,
+ period,
+ ctx->pause,
+ ctx->start_time);
+
+ return ctx->pause;
+}
- if (dirty_exceeded && !wb->dirty_exceeded)
- wb->dirty_exceeded = 1;
-
- if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
- BANDWIDTH_INTERVAL))
- __wb_update_bandwidth(gdtc, mdtc, true);
-
- /* throttle according to the chosen dtc */
- dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
- task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
- RATELIMIT_CALC_SHIFT;
- max_pause = wb_max_pause(wb, sdtc->wb_dirty);
- min_pause = wb_min_pause(wb, max_pause,
- task_ratelimit, dirty_ratelimit,
- &nr_dirtied_pause);
-
- if (unlikely(task_ratelimit == 0)) {
- period = max_pause;
- pause = max_pause;
- goto pause;
- }
- period = HZ * pages_dirtied / task_ratelimit;
- pause = period;
- if (current->dirty_paused_when)
- pause -= now - current->dirty_paused_when;
- /*
- * For less than 1s think time (ext3/4 may block the dirtier
- * for up to 800ms from time to time on 1-HDD; so does xfs,
- * however at much less frequency), try to compensate it in
- * future periods by updating the virtual time; otherwise just
- * do a reset, as it may be a light dirtier.
- */
- if (pause < min_pause) {
- trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
- dirty_ratelimit,
- task_ratelimit,
- pages_dirtied,
- period,
- min(pause, 0L),
- start_time);
- if (pause < -HZ) {
- current->dirty_paused_when = now;
- current->nr_dirtied = 0;
- } else if (period) {
- current->dirty_paused_when += period;
- current->nr_dirtied = 0;
- } else if (current->nr_dirtied_pause <= pages_dirtied)
- current->nr_dirtied_pause += pages_dirtied;
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data. It looks at the number of dirty pages in the machine and will force
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
+{
+ int ret;
+ struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
+
+ for (;;) {
+ ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
+ if (!ret)
break;
- }
- if (unlikely(pause > max_pause)) {
- /* for occasional dropped task_ratelimit */
- now += min(pause - max_pause, max_pause);
- pause = max_pause;
- }
-pause:
- trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
- dirty_ratelimit,
- task_ratelimit,
- pages_dirtied,
- period,
- pause,
- start_time);
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
- io_schedule_timeout(pause);
+ wb->dirty_sleep = ctx.now;
+ io_schedule_timeout(ctx.pause);
- current->dirty_paused_when = now + pause;
+ current->dirty_paused_when = ctx.now + ctx.pause;
current->nr_dirtied = 0;
- current->nr_dirtied_pause = nr_dirtied_pause;
+ current->nr_dirtied_pause = ctx.nr_dirtied_pause;
/*
* This is typically equal to (dirty < thresh) and can also
* keep "1000+ dd on a slow USB stick" under control.
*/
- if (task_ratelimit)
+ if (ctx.task_ratelimit)
break;
/*
@@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
*/
- if (sdtc->wb_dirty <= wb_stat_error())
+ if (ctx.sdtc->wb_dirty <= wb_stat_error())
break;
if (fatal_signal_pending(current))
break;
}
- if (!dirty_exceeded && wb->dirty_exceeded)
+ if (!ctx.dirty_exceeded && wb->dirty_exceeded)
wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
@@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (laptop_mode)
return;
- if (nr_reclaimable > gdtc->bg_thresh)
+ if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
wb_start_background_writeback(wb);
}
--
2.30.2
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [RFC PATCH v2 13/16] mm: add balance_dirty_pages_ratelimited_flags() function
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (11 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 12/16] mm: factor out _balance_dirty_pages() from balance_dirty_pages() Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 14/16] iomap: use balance_dirty_pages_ratelimited_flags in iomap_write_iter Stefan Roesch
` (2 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This adds the function balance_dirty_pages_ratelimited_flags(). It adds
an is_async parameter on top of balance_dirty_pages_ratelimited(). In
case this is an async write, it calls _balance_dirty_pages() to
determine whether write throttling needs to be enabled. If write
throttling is enabled, it returns -EAGAIN, so the write request can be
punted to the io-uring worker.
The new function is changed to return the sleep time, so callers can
observe whether the write has been punted.
For non-async writes the current behavior is maintained.
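For illustration, a nowait-aware buffered write path would consume the
new helper roughly as follows. This is only a sketch: the surrounding
function is made up, and the real caller is added to iomap in the next
patch.

	static ssize_t example_buffered_write(struct kiocb *iocb,
					      struct iov_iter *from)
	{
		struct address_space *mapping = iocb->ki_filp->f_mapping;
		ssize_t written = 0;

		while (iov_iter_count(from)) {
			/* Throttle check up front; async writers get -EAGAIN. */
			int ret = balance_dirty_pages_ratelimited_flags(mapping,
						iocb->ki_flags & IOCB_NOWAIT);
			if (ret)
				return written ? written : ret;

			/* ... get a page, copy user data, dirty it, and
			 *     advance 'written' by the amount copied ...
			 */
		}

		return written;
	}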
Signed-off-by: Stefan Roesch <[email protected]>
---
include/linux/writeback.h | 1 +
mm/page-writeback.c | 48 ++++++++++++++++++++++++++-------------
2 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fec248ab1fec..d589804bb3be 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -373,6 +373,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
void wb_update_bandwidth(struct bdi_writeback *wb);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool is_async);
bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cbb74c0666c6..78f1326f3f20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1877,28 +1877,17 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
*/
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
-/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping: address_space which was dirtied
- *
- * Processes which are dirtying memory should call in here once for each page
- * which was newly dirtied. The function will periodically check the system's
- * dirty state and will initiate writeback if needed.
- *
- * Once we're over the dirty memory limit we decrease the ratelimiting
- * by a lot, to prevent individual processes from overshooting the limit
- * by (ratelimit_pages) each.
- */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool is_async)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
+ int ret = 0;
int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
- return;
+ return ret;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
@@ -1937,10 +1926,37 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
}
preempt_enable();
- if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(wb, current->nr_dirtied);
+ if (unlikely(current->nr_dirtied >= ratelimit)) {
+ if (is_async) {
+ struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
+
+ ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
+ if (ret)
+ ret = -EAGAIN;
+ } else {
+ balance_dirty_pages(wb, current->nr_dirtied);
+ }
+ }
wb_put(wb);
+ return ret;
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+ balance_dirty_pages_ratelimited_flags(mapping, false);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
--
2.30.2
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [RFC PATCH v2 14/16] iomap: use balance_dirty_pages_ratelimited_flags in iomap_write_iter
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (12 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 13/16] mm: add balance_dirty_pages_ratelimited_flags() function Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 15/16] io_uring: add tracepoint for short writes Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 16/16] xfs: enable async buffered write support Stefan Roesch
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This replaces the call to balance_dirty_pages_ratelimited() with a call
to balance_dirty_pages_ratelimited_flags(). This makes it possible to
specify whether the write request is async or not.
In addition this moves the call to the beginning of the function. If the
call sits at the end of the function and the decision is made to
throttle writes, there is no request that io-uring can wait on. By
moving it to the beginning of the function, the write request is not
issued, but -EAGAIN is returned instead, and io-uring will punt the
request and process it in the io-worker.
As a consequence of moving the call to the beginning of the function,
write throttling happens one page later.
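Roughly, the loop in iomap_write_iter() ends up with the following shape
after this change (a simplified outline for illustration, not the
literal code):

	do {
		/* Throttle check first; IOMAP_NOWAIT writers see -EAGAIN. */
		status = balance_dirty_pages_ratelimited_flags(
				iter->inode->i_mapping,
				iter->flags & IOMAP_NOWAIT);
		if (unlikely(status))
			break;

		/* ... begin write, copy from the iov_iter, commit, and
		 *     advance pos/written/length by the amount copied ...
		 */
	} while (iov_iter_count(i) && length);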
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/iomap/buffered-io.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ceb3091f94c2..41a8e0bb2edd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -794,6 +794,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
+ status = balance_dirty_pages_ratelimited_flags(iter->inode->i_mapping,
+ (iter->flags & IOMAP_NOWAIT));
+ if (unlikely(status))
+ break;
+
if (bytes > length)
bytes = length;
@@ -842,8 +847,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
pos += status;
written += status;
length -= status;
-
- balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
return written ? written : status;
--
2.30.2
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [RFC PATCH v2 15/16] io_uring: add tracepoint for short writes
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (13 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 14/16] iomap: use balance_dirty_pages_ratelimited_flags in iomap_write_iter Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
2022-05-16 16:47 ` [RFC PATCH v2 16/16] xfs: enable async buffered write support Stefan Roesch
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This adds the io_uring_short_write tracepoint to io_uring. A short write
occurs when not all pages required for a write are in the page cache and
the async buffered write has to return -EAGAIN.
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/io_uring.c | 3 +++
include/trace/events/io_uring.h | 25 +++++++++++++++++++++++++
2 files changed, 28 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f3aaac286509..7435a9c2007f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4032,6 +4032,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (ret2 != req->result && ret2 >= 0 && need_complete_io(req)) {
struct io_async_rw *rw;
+ trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
+ req->result, ret2);
+
/* This is a partial write. The file pos has already been
* updated, setup the async struct to complete the request
* in the worker. Also update bytes_done to account for
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index cddf5b6fbeb4..661834361d33 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -543,6 +543,31 @@ TRACE_EVENT(io_uring_req_failed,
(unsigned long long) __entry->pad2, __entry->error)
);
+TRACE_EVENT(io_uring_short_write,
+
+ TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),
+
+ TP_ARGS(ctx, fpos, wanted, got),
+
+ TP_STRUCT__entry(
+ __field(void *, ctx)
+ __field(u64, fpos)
+ __field(u64, wanted)
+ __field(u64, got)
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->fpos = fpos;
+ __entry->wanted = wanted;
+ __entry->got = got;
+ ),
+
+ TP_printk("ring %p, fpos %lld, wanted %lld, got %lld",
+ __entry->ctx, __entry->fpos,
+ __entry->wanted, __entry->got)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
--
2.30.2
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [RFC PATCH v2 16/16] xfs: enable async buffered write support
2022-05-16 16:47 [RFC PATCH v2 00/16] io-uring/xfs: support async buffered writes Stefan Roesch
` (14 preceding siblings ...)
2022-05-16 16:47 ` [RFC PATCH v2 15/16] io_uring: add tracepoint for short writes Stefan Roesch
@ 2022-05-16 16:47 ` Stefan Roesch
15 siblings, 0 replies; 22+ messages in thread
From: Stefan Roesch @ 2022-05-16 16:47 UTC (permalink / raw)
To: io-uring, kernel-team, linux-mm, linux-xfs, linux-fsdevel
Cc: shr, david, jack
This turns on the async buffered write support for XFS.
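Once FMODE_BUF_WASYNC is set, an application can exercise the new path
with a plain buffered write over io_uring. A minimal userspace sketch,
assuming liburing is installed and /mnt/xfs is an XFS mount (both the
path and the buffer size are only examples):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[4096];
		int fd;

		memset(buf, 'a', sizeof(buf));

		/* Buffered open: no O_DIRECT, so this hits the buffered path. */
		fd = open("/mnt/xfs/testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0)
			return 1;

		if (io_uring_queue_init(8, &ring, 0) < 0)
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
		io_uring_submit(&ring);

		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("write returned %d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}

		io_uring_queue_exit(&ring);
		return 0;
	}

With this series applied, such a write can complete inline when the
pages and blocks are already available, instead of always being punted
to an io-worker.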
Signed-off-by: Stefan Roesch <[email protected]>
---
fs/xfs/xfs_file.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ad3175b7d366..af4fdc852da5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1169,7 +1169,7 @@ xfs_file_open(
return -EFBIG;
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
return 0;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 22+ messages in thread