* [PATCH 1/5] btrfs: remove pointless addition in btrfs_encoded_read
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
@ 2024-10-14 17:18 ` Mark Harmstone
2024-10-14 17:18 ` [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback Mark Harmstone
` (4 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-14 17:18 UTC
To: linux-btrfs, io-uring; +Cc: Mark Harmstone
iocb->ki_pos isn't used after this function, so there's no point in
changing its value.
Signed-off-by: Mark Harmstone <[email protected]>
---
fs/btrfs/inode.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b1b6564ab68f..b024ebc3dcd6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9280,7 +9280,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
&cached_state, extent_start,
count, encoded, &unlocked);
- goto out;
+ goto out_em;
}
/*
@@ -9346,9 +9346,6 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
&unlocked);
}
-out:
- if (ret >= 0)
- iocb->ki_pos += encoded->len;
out_em:
free_extent_map(em);
out_unlock_extent:
--
2.44.2
* [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
2024-10-14 17:18 ` [PATCH 1/5] btrfs: remove pointless addition in btrfs_encoded_read Mark Harmstone
@ 2024-10-14 17:18 ` Mark Harmstone
2024-10-15 15:23 ` David Sterba
2024-10-21 13:21 ` David Sterba
2024-10-14 17:18 ` [PATCH 3/5] btrfs: change btrfs_encoded_read so that reading of extent is done by caller Mark Harmstone
` (3 subsequent siblings)
5 siblings, 2 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-14 17:18 UTC
To: linux-btrfs, io-uring; +Cc: Mark Harmstone
Change btrfs_encoded_read_regular_fill_pages so that it takes a callback
rather than waiting, and add new helper function btrfs_encoded_read_wait_cb
to match the existing behaviour.
Signed-off-by: Mark Harmstone <[email protected]>
---
fs/btrfs/btrfs_inode.h | 13 +++++++-
fs/btrfs/inode.c | 70 ++++++++++++++++++++++++++++++++----------
fs/btrfs/send.c | 15 ++++++++-
3 files changed, 79 insertions(+), 19 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3056c8aed8ef..6aea5bedc968 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -601,10 +601,21 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
int btrfs_writepage_cow_fixup(struct page *page);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
+typedef void (btrfs_encoded_read_cb_t)(void *, int);
+
+struct btrfs_encoded_read_wait_ctx {
+ wait_queue_head_t wait;
+ bool done;
+ int err;
+};
+
+void btrfs_encoded_read_wait_cb(void *ctx, int err);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
u64 disk_io_size,
- struct page **pages);
+ struct page **pages,
+ btrfs_encoded_read_cb_t cb,
+ void *cb_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b024ebc3dcd6..b5abe98f3af4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9080,9 +9080,10 @@ static ssize_t btrfs_encoded_read_inline(
}
struct btrfs_encoded_read_private {
- wait_queue_head_t wait;
atomic_t pending;
blk_status_t status;
+ btrfs_encoded_read_cb_t *cb;
+ void *cb_ctx;
};
static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
@@ -9100,26 +9101,33 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
- wake_up(&priv->wait);
+ if (!atomic_dec_return(&priv->pending)) {
+ priv->cb(priv->cb_ctx,
+ blk_status_to_errno(READ_ONCE(priv->status)));
+ kfree(priv);
+ }
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_io_size, struct page **pages,
+ btrfs_encoded_read_cb_t cb,
+ void *cb_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv;
unsigned long i = 0;
struct btrfs_bio *bbio;
- init_waitqueue_head(&priv.wait);
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+
+ atomic_set(&priv->pending, 1);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -9127,11 +9135,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -9142,13 +9150,28 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
+ priv->cb = cb;
+ priv->cb_ctx = cb_ctx;
+
btrfs_submit_bio(bbio, 0);
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
- /* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ if (!atomic_dec_return(&priv->pending)) {
+ cb(cb_ctx, blk_status_to_errno(READ_ONCE(priv->status)));
+ kfree(priv);
+ }
+
+ return 0;
+}
+
+void btrfs_encoded_read_wait_cb(void *ctx, int err)
+{
+ struct btrfs_encoded_read_wait_ctx *priv = ctx;
+
+ priv->err = err;
+ priv->done = true;
+
+ wake_up(&priv->wait);
}
static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
@@ -9166,6 +9189,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
u64 cur;
size_t page_offset;
ssize_t ret;
+ struct btrfs_encoded_read_wait_ctx wait_ctx;
nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
@@ -9177,11 +9201,23 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out;
}
+ wait_ctx.done = false;
+ init_waitqueue_head(&wait_ctx.wait);
+
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
- disk_io_size, pages);
+ disk_io_size, pages,
+ btrfs_encoded_read_wait_cb,
+ &wait_ctx);
if (ret)
goto out;
+ io_wait_event(wait_ctx.wait, wait_ctx.done);
+
+ if (wait_ctx.err) {
+ ret = wait_ctx.err;
+ goto out;
+ }
+
unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 619fa0b8b3f6..52f653c6671e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5613,6 +5613,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
u64 disk_bytenr, disk_num_bytes;
u32 data_offset;
struct btrfs_cmd_header *hdr;
+ struct btrfs_encoded_read_wait_ctx wait_ctx;
u32 crc;
int ret;
@@ -5671,6 +5672,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
goto out;
}
+ wait_ctx.done = false;
+ init_waitqueue_head(&wait_ctx.wait);
+
/*
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
@@ -5678,10 +5682,19 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
- (data_offset >> PAGE_SHIFT));
+ (data_offset >> PAGE_SHIFT),
+ btrfs_encoded_read_wait_cb,
+ &wait_ctx);
if (ret)
goto out;
+ io_wait_event(wait_ctx.wait, wait_ctx.done);
+
+ if (wait_ctx.err) {
+ ret = wait_ctx.err;
+ goto out;
+ }
+
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
hdr->crc = 0;
--
2.44.2
* Re: [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback
2024-10-14 17:18 ` [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback Mark Harmstone
@ 2024-10-15 15:23 ` David Sterba
2024-10-21 13:21 ` David Sterba
1 sibling, 0 replies; 17+ messages in thread
From: David Sterba @ 2024-10-15 15:23 UTC
To: Mark Harmstone; +Cc: linux-btrfs, io-uring
On Mon, Oct 14, 2024 at 06:18:24PM +0100, Mark Harmstone wrote:
> Change btrfs_encoded_read_regular_fill_pages so that it takes a callback
> rather than waiting, and add new helper function btrfs_encoded_read_wait_cb
> to match the existing behaviour.
Please avoid callbacks and function pointers when it's within the same
subsystem. Pass some enum and do a switch inside the function, or wrap
it in a helper if needed. Since Spectre/Meltdown, function pointers in
the kernel have been removed where possible, and I don't see a strong
reason for them here.
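Roughly, an untested sketch of what I mean (the enum and the 'type'
member here are hypothetical, not from the series):

enum btrfs_encoded_read_type {
	BTRFS_ENCODED_READ_WAIT,	/* synchronous read, wake up a waiter */
	BTRFS_ENCODED_READ_URING,	/* async read, complete an io_uring cmd */
};

/* Called from the endio path instead of priv->cb(priv->cb_ctx, err). */
static void btrfs_encoded_read_done(struct btrfs_encoded_read_private *priv,
				    int err)
{
	switch (priv->type) {
	case BTRFS_ENCODED_READ_WAIT:
		btrfs_encoded_read_wait_cb(priv->cb_ctx, err);
		break;
	case BTRFS_ENCODED_READ_URING:
		btrfs_uring_read_extent_cb(priv->cb_ctx, err);
		break;
	}
}

That keeps the dispatch within btrfs without any indirect calls.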
* Re: [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback
2024-10-14 17:18 ` [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback Mark Harmstone
2024-10-15 15:23 ` David Sterba
@ 2024-10-21 13:21 ` David Sterba
1 sibling, 0 replies; 17+ messages in thread
From: David Sterba @ 2024-10-21 13:21 UTC
To: Mark Harmstone; +Cc: linux-btrfs, io-uring
On Mon, Oct 14, 2024 at 06:18:24PM +0100, Mark Harmstone wrote:
> Change btrfs_encoded_read_regular_fill_pages so that it takes a callback
> rather than waiting, and add new helper function btrfs_encoded_read_wait_cb
> to match the existing behaviour.
>
> Signed-off-by: Mark Harmstone <[email protected]>
> ---
> fs/btrfs/btrfs_inode.h | 13 +++++++-
> fs/btrfs/inode.c | 70 ++++++++++++++++++++++++++++++++----------
> fs/btrfs/send.c | 15 ++++++++-
> 3 files changed, 79 insertions(+), 19 deletions(-)
>
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 3056c8aed8ef..6aea5bedc968 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -601,10 +601,21 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
> int btrfs_writepage_cow_fixup(struct page *page);
> int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
> int compress_type);
> +typedef void (btrfs_encoded_read_cb_t)(void *, int);
> +
> +struct btrfs_encoded_read_wait_ctx {
> + wait_queue_head_t wait;
> + bool done;
> + int err;
Please reorder this so it does not waste the bytes after 'done' to
align 'err'.
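i.e. something like:

	wait_queue_head_t wait;
	int err;
	bool done;

so that any padding ends up at the tail of the struct rather than
between the members.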
> +};
> +
> +void btrfs_encoded_read_wait_cb(void *ctx, int err);
> int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
> u64 file_offset, u64 disk_bytenr,
> u64 disk_io_size,
> - struct page **pages);
> + struct page **pages,
> + btrfs_encoded_read_cb_t cb,
> + void *cb_ctx);
> ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> struct btrfs_ioctl_encoded_io_args *encoded);
> ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index b024ebc3dcd6..b5abe98f3af4 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9080,9 +9080,10 @@ static ssize_t btrfs_encoded_read_inline(
> }
>
> struct btrfs_encoded_read_private {
> - wait_queue_head_t wait;
> atomic_t pending;
> blk_status_t status;
> + btrfs_encoded_read_cb_t *cb;
> + void *cb_ctx;
The final version of this structure could also be reordered so it does
not leave unnecessary holes. I think status is a u8, so it now fills the
hole after pending, but I'm not sure if other patches make more changes.
> };
>
> static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
> @@ -9100,26 +9101,33 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
> */
> WRITE_ONCE(priv->status, bbio->bio.bi_status);
> }
> - if (!atomic_dec_return(&priv->pending))
> - wake_up(&priv->wait);
> + if (!atomic_dec_return(&priv->pending)) {
Though it's in the original code, please rewrite the condition so it
reads as an arithmetic condition, "== 0".
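i.e.:

	if (atomic_dec_return(&priv->pending) == 0) {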
> + priv->cb(priv->cb_ctx,
> + blk_status_to_errno(READ_ONCE(priv->status)));
> + kfree(priv);
> + }
> bio_put(&bbio->bio);
> }
* [PATCH 3/5] btrfs: change btrfs_encoded_read so that reading of extent is done by caller
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
2024-10-14 17:18 ` [PATCH 1/5] btrfs: remove pointless addition in btrfs_encoded_read Mark Harmstone
2024-10-14 17:18 ` [PATCH 2/5] btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback Mark Harmstone
@ 2024-10-14 17:18 ` Mark Harmstone
2024-10-14 17:18 ` [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read Mark Harmstone
` (2 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-14 17:18 UTC
To: linux-btrfs, io-uring; +Cc: Mark Harmstone
Change the behaviour of btrfs_encoded_read so that if it needs to read
an extent from disk, it leaves the extent and inode locked and returns
-EIOCBQUEUED. The caller is then responsible for doing the I/O via
btrfs_encoded_read_regular and unlocking the extent and inode.
Signed-off-by: Mark Harmstone <[email protected]>
---
fs/btrfs/btrfs_inode.h | 10 +++++++-
fs/btrfs/inode.c | 58 ++++++++++++++++++++----------------------
fs/btrfs/ioctl.c | 33 +++++++++++++++++++++++-
3 files changed, 69 insertions(+), 32 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6aea5bedc968..3a0b519c51ca 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -617,7 +617,15 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
btrfs_encoded_read_cb_t cb,
void *cb_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b5abe98f3af4..e58284d33b35 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9174,13 +9174,12 @@ void btrfs_encoded_read_wait_cb(void *ctx, int err)
wake_up(&priv->wait);
}
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -9254,15 +9253,16 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
bool unlocked = false;
@@ -9288,13 +9288,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
cond_resched();
}
@@ -9314,7 +9314,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
goto out_em;
}
@@ -9327,12 +9327,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->disk_bytenr;
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
@@ -9341,7 +9341,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->disk_num_bytes;
+ *disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
@@ -9351,44 +9351,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_em;
}
out_em:
free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e0a664b8a46a..e2ecf0bcda24 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4516,12 +4516,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags);
size_t copy_end;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
ssize_t ret;
+ u64 disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4574,7 +4579,33 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+
+ if (ret == -EIOCBQUEUED) {
+ bool unlocked = false;
+ u64 start, lockend, count;
+
+ start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ if (args.compression)
+ count = disk_io_size;
+ else
+ count = args.len;
+
+ ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression,
+ &unlocked);
+
+ if (!unlocked) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ }
+
if (ret >= 0) {
fsnotify_access(file);
if (copy_to_user(argp + copy_end,
--
2.44.2
* [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
` (2 preceding siblings ...)
2024-10-14 17:18 ` [PATCH 3/5] btrfs: change btrfs_encoded_read so that reading of extent is done by caller Mark Harmstone
@ 2024-10-14 17:18 ` Mark Harmstone
2024-10-14 22:12 ` Jens Axboe
2024-10-14 17:18 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
2024-10-14 17:44 ` [PATCH v3 0/5] btrfs: encoded reads via io_uring Boris Burkov
5 siblings, 1 reply; 17+ messages in thread
From: Mark Harmstone @ 2024-10-14 17:18 UTC
To: linux-btrfs, io-uring; +Cc: Mark Harmstone
Adds a nowait parameter to btrfs_encoded_read which, if true, causes
the function to return -EAGAIN rather than sleeping.
Signed-off-by: Mark Harmstone <[email protected]>
---
fs/btrfs/btrfs_inode.h | 2 +-
fs/btrfs/inode.c | 59 ++++++++++++++++++++++++++++++++----------
fs/btrfs/ioctl.c | 2 +-
3 files changed, 47 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3a0b519c51ca..2334961e71ed 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -618,7 +618,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
void *cb_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded,
- struct extent_state **cached_state,
+ bool nowait, struct extent_state **cached_state,
u64 *disk_bytenr, u64 *disk_io_size);
ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
u64 start, u64 lockend,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e58284d33b35..c536e37cb6b6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8999,7 +8999,7 @@ static ssize_t btrfs_encoded_read_inline(
struct extent_state **cached_state,
u64 extent_start, size_t count,
struct btrfs_ioctl_encoded_io_args *encoded,
- bool *unlocked)
+ bool *unlocked, bool nowait)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_root *root = inode->root;
@@ -9018,6 +9018,9 @@ static ssize_t btrfs_encoded_read_inline(
ret = -ENOMEM;
goto out;
}
+
+ path->nowait = !!nowait;
+
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
@@ -9254,7 +9257,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded,
- struct extent_state **cached_state,
+ bool nowait, struct extent_state **cached_state,
u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
@@ -9268,7 +9271,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = btrfs_inode_lock(inode,
+ BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
+ if (ret)
+ return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9281,21 +9287,45 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- for (;;) {
+ if (nowait) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(inode, start,
- lockend - start + 1);
- if (ret)
+ if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
+ start, lockend)) {
+ ret = -EAGAIN;
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, cached_state);
+ }
+
+ if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
- if (!ordered)
- break;
- btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
- cond_resched();
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+ } else {
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent(io_tree, start, lockend, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ cond_resched();
+ }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
@@ -9315,7 +9345,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
cached_state, extent_start,
- count, encoded, &unlocked);
+ count, encoded, &unlocked,
+ nowait);
goto out_em;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e2ecf0bcda24..8c9ff4898ab0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4579,7 +4579,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, false, &cached_state,
&disk_bytenr, &disk_io_size);
if (ret == -EIOCBQUEUED) {
--
2.44.2
* Re: [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read
2024-10-14 17:18 ` [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read Mark Harmstone
@ 2024-10-14 22:12 ` Jens Axboe
2024-10-15 8:48 ` Mark Harmstone
0 siblings, 1 reply; 17+ messages in thread
From: Jens Axboe @ 2024-10-14 22:12 UTC
To: Mark Harmstone, linux-btrfs, io-uring
On 10/14/24 11:18 AM, Mark Harmstone wrote:
> Adds a nowait parameter to btrfs_encoded_read which, if true, causes
> the function to return -EAGAIN rather than sleeping.
Can't we just rely on kiocb->ki_flags & IOCB_NOWAIT for this?
Doesn't really change the patch much, but you do avoid that extra
parameter.
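Something like this (untested):

	bool nowait = iocb->ki_flags & IOCB_NOWAIT;

with the io_uring side setting kiocb.ki_flags |= IOCB_NOWAIT when the
command is issued with IO_URING_F_NONBLOCK.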
--
Jens Axboe
* Re: [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read
2024-10-14 22:12 ` Jens Axboe
@ 2024-10-15 8:48 ` Mark Harmstone
0 siblings, 0 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-15 8:48 UTC
To: Jens Axboe, Mark Harmstone, [email protected],
[email protected]
On 14/10/24 23:12, Jens Axboe wrote:
> >
> On 10/14/24 11:18 AM, Mark Harmstone wrote:
>> Adds a nowait parameter to btrfs_encoded_read which, if true, causes
>> the function to return -EAGAIN rather than sleeping.
>
> Can't we just rely on kiocb->ki_flags & IOCB_NOWAIT for this?
> Doesn't really change the patch much, but you do avoid that extra
> parameter.
Thanks Jens. Yes, that makes sense.
Mark
* [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
` (3 preceding siblings ...)
2024-10-14 17:18 ` [PATCH 4/5] btrfs: add nowait parameter to btrfs_encoded_read Mark Harmstone
@ 2024-10-14 17:18 ` Mark Harmstone
2024-10-21 13:50 ` David Sterba
2024-10-14 17:44 ` [PATCH v3 0/5] btrfs: encoded reads via io_uring Boris Burkov
5 siblings, 1 reply; 17+ messages in thread
From: Mark Harmstone @ 2024-10-14 17:18 UTC
To: linux-btrfs, io-uring; +Cc: Mark Harmstone
Adds an io_uring command for encoded reads, using the same interface as
the existing BTRFS_IOC_ENCODED_READ ioctl.
Signed-off-by: Mark Harmstone <[email protected]>
---
fs/btrfs/file.c | 1 +
fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/ioctl.h | 1 +
3 files changed, 285 insertions(+)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2aeb8116549c..e33ca73fef8c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .uring_cmd = btrfs_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8c9ff4898ab0..c0393575cf5e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
return ret;
}
+struct btrfs_uring_priv {
+ struct io_uring_cmd *cmd;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct kiocb iocb;
+ struct iovec *iov;
+ struct iov_iter iter;
+ struct extent_state *cached_state;
+ u64 count;
+ bool compressed;
+ u64 start;
+ u64 lockend;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ unsigned long i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ if (priv->compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+ page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < priv->count) {
+ size_t bytes = min_t(size_t, priv->count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
+ &priv->iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ i++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = priv->count;
+
+out:
+ unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ add_rchar(current, ret);
+
+ for (unsigned long i = 0; i < priv->nr_pages; i++) {
+ __free_page(priv->pages[i]);
+ }
+
+ kfree(priv->pages);
+ kfree(priv->iov);
+ kfree(priv);
+}
+
+static void btrfs_uring_read_extent_cb(void *ctx, int err)
+{
+ struct btrfs_uring_priv *priv = ctx;
+
+ *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
+ io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state *cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ struct iovec *iov,
+ struct io_uring_cmd *cmd)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ struct btrfs_uring_priv *priv = NULL;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ if (ret) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ priv = kmalloc(sizeof(*priv), GFP_NOFS);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ priv->iocb = *iocb;
+ priv->iov = iov;
+ priv->iter = *iter;
+ priv->count = count;
+ priv->cmd = cmd;
+ priv->cached_state = cached_state;
+ priv->compressed = compressed;
+ priv->nr_pages = nr_pages;
+ priv->pages = pages;
+ priv->start = start;
+ priv->lockend = lockend;
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages,
+ btrfs_uring_read_extent_cb,
+ priv);
+ if (ret)
+ goto fail;
+
+ return -EIOCBQUEUED;
+
+fail:
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ kfree(priv);
+ return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct btrfs_ioctl_encoded_io_args args = {0};
+ int ret;
+ u64 disk_bytenr, disk_io_size;
+ struct file *file = cmd->file;
+ struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ struct extent_state *cached_state = NULL;
+ u64 start, lockend;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, (const void *)cmd->sqe->addr,
+ copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, (const void *)cmd->sqe->addr,
+ copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
+
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_free;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ start = ALIGN_DOWN(pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args,
+ issue_flags & IO_URING_F_NONBLOCK,
+ &cached_state, &disk_bytenr, &disk_io_size);
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ goto out_free;
+
+ file_accessed(file);
+
+ if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel)) {
+ if (ret == -EIOCBQUEUED) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ if (ret == -EIOCBQUEUED) {
+ u64 count;
+
+ /* If we've optimized things by storing the iovecs on the stack,
+ * undo this. */
+ if (!iov) {
+ iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
+ GFP_NOFS);
+ if (!iov) {
+ unlock_extent(io_tree, start, lockend,
+ &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ memcpy(iov, iovstack,
+ sizeof(struct iovec) * args.iovcnt);
+ }
+
+ count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+ ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+ cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, iov, cmd);
+
+ goto out_acct;
+ }
+
+out_free:
+ kfree(iov);
+
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+
+ return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+#endif
+ return btrfs_uring_encoded_read(cmd, issue_flags);
+ }
+
+ return -EINVAL;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..288f4f5c4409 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
#endif
--
2.44.2
* Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-14 17:18 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
@ 2024-10-21 13:50 ` David Sterba
2024-10-21 16:15 ` Pavel Begunkov
2024-10-21 17:05 ` Mark Harmstone
0 siblings, 2 replies; 17+ messages in thread
From: David Sterba @ 2024-10-21 13:50 UTC
To: Mark Harmstone; +Cc: linux-btrfs, io-uring
On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote:
> Adds an io_uring command for encoded reads, using the same interface as
Add ...
> the existing BTRFS_IOC_ENCODED_READ ioctl.
This is probably a good summary in a changelog, but the patch is quite
long, so it feels like it should describe in more detail how it's done.
Connecting two interfaces can be done in various ways, so at least
mention that it's a simple pass-through, or whether there are any
complications regarding locking, object lifetime and such.
> Signed-off-by: Mark Harmstone <[email protected]>
> ---
> fs/btrfs/file.c | 1 +
> fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/ioctl.h | 1 +
> 3 files changed, 285 insertions(+)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 2aeb8116549c..e33ca73fef8c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = {
> .compat_ioctl = btrfs_compat_ioctl,
> #endif
> .remap_file_range = btrfs_remap_file_range,
> + .uring_cmd = btrfs_uring_cmd,
> .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
> };
>
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 8c9ff4898ab0..c0393575cf5e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -29,6 +29,7 @@
> #include <linux/fileattr.h>
> #include <linux/fsverity.h>
> #include <linux/sched/xacct.h>
> +#include <linux/io_uring/cmd.h>
> #include "ctree.h"
> #include "disk-io.h"
> #include "export.h"
> @@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
> return ret;
> }
>
> +struct btrfs_uring_priv {
> + struct io_uring_cmd *cmd;
> + struct page **pages;
> + unsigned long nr_pages;
> + struct kiocb iocb;
> + struct iovec *iov;
> + struct iov_iter iter;
> + struct extent_state *cached_state;
> + u64 count;
> + bool compressed;
This leaves a 7 byte hole.
> + u64 start;
> + u64 lockend;
> +};
The whole structure should be documented and the members too if it's not
obvious what they are used for.
> +
> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu;
> + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + unsigned long i;
Why is this long?
> + u64 cur;
> + size_t page_offset;
> + ssize_t ret;
> +
> + if (priv->compressed) {
> + i = 0;
> + page_offset = 0;
> + } else {
> + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> + page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1);
Please don't open code page_offset()
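i.e. something like this (assuming offset_in_page() is the helper meant
here):

	page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);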
> + }
> + cur = 0;
> + while (cur < priv->count) {
> + size_t bytes = min_t(size_t, priv->count - cur,
> + PAGE_SIZE - page_offset);
> +
> + if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> + &priv->iter) != bytes) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + i++;
> + cur += bytes;
> + page_offset = 0;
> + }
> + ret = priv->count;
> +
> +out:
> + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +
> + io_uring_cmd_done(cmd, ret, 0, issue_flags);
> + add_rchar(current, ret);
> +
> + for (unsigned long i = 0; i < priv->nr_pages; i++) {
This shadows the 'i' of the same type declared at the top of the
function. Maybe don't call it just 'i' but 'index', as it's used outside
of a loop.
> + __free_page(priv->pages[i]);
> + }
Please drop the outer { } for a single statement block.
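i.e.:

	for (unsigned long i = 0; i < priv->nr_pages; i++)
		__free_page(priv->pages[i]);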
> +
> + kfree(priv->pages);
> + kfree(priv->iov);
> + kfree(priv);
> +}
> +
> +static void btrfs_uring_read_extent_cb(void *ctx, int err)
> +{
> + struct btrfs_uring_priv *priv = ctx;
> +
> + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
Isn't there a helper for that? Type casting should be done in justified
cases and as an exception.
> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> +}
> +
> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> + u64 start, u64 lockend,
> + struct extent_state *cached_state,
> + u64 disk_bytenr, u64 disk_io_size,
> + size_t count, bool compressed,
> + struct iovec *iov,
> + struct io_uring_cmd *cmd)
> +{
> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct page **pages;
> + struct btrfs_uring_priv *priv = NULL;
> + unsigned long nr_pages;
> + int ret;
> +
> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
The allocation sizes are derived from disk_io_size, which comes from
the outside, potentially making large allocations. Or is there some
inherent limit on the maximum size?
> + if (ret) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv = kmalloc(sizeof(*priv), GFP_NOFS);
> + if (!priv) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv->iocb = *iocb;
> + priv->iov = iov;
> + priv->iter = *iter;
> + priv->count = count;
> + priv->cmd = cmd;
> + priv->cached_state = cached_state;
> + priv->compressed = compressed;
> + priv->nr_pages = nr_pages;
> + priv->pages = pages;
> + priv->start = start;
> + priv->lockend = lockend;
> +
> + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> + disk_io_size, pages,
> + btrfs_uring_read_extent_cb,
> + priv);
> + if (ret)
> + goto fail;
> +
> + return -EIOCBQUEUED;
> +
> +fail:
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + kfree(priv);
Does this leak pages and priv->pages?
> + return ret;
> +}
> +
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> + flags);
> + size_t copy_end;
> + struct btrfs_ioctl_encoded_io_args args = {0};
= { 0 }
> + int ret;
> + u64 disk_bytenr, disk_io_size;
> + struct file *file = cmd->file;
> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct iovec iovstack[UIO_FASTIOV];
> + struct iovec *iov = iovstack;
> + struct iov_iter iter;
> + loff_t pos;
> + struct kiocb kiocb;
> + struct extent_state *cached_state = NULL;
> + u64 start, lockend;
The stack consumption looks quite high.
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + ret = -EPERM;
> + goto out_acct;
> + }
> +
> + if (issue_flags & IO_URING_F_COMPAT) {
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> + struct btrfs_ioctl_encoded_io_args_32 args32;
> +
> + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> + flags);
> + if (copy_from_user(&args32, (const void *)cmd->sqe->addr,
> + copy_end)) {
> + ret = -EFAULT;
> + goto out_acct;
> + }
> + args.iov = compat_ptr(args32.iov);
> + args.iovcnt = args32.iovcnt;
> + args.offset = args32.offset;
> + args.flags = args32.flags;
> +#else
> + return -ENOTTY;
> +#endif
> + } else {
> + copy_end = copy_end_kernel;
> + if (copy_from_user(&args, (const void *)cmd->sqe->addr,
> + copy_end)) {
> + ret = -EFAULT;
> + goto out_acct;
> + }
> + }
> +
> + if (args.flags != 0)
> + return -EINVAL;
> +
> + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> + &iov, &iter);
> + if (ret < 0)
> + goto out_acct;
> +
> + if (iov_iter_count(&iter) == 0) {
> + ret = 0;
> + goto out_free;
> + }
> +
> + pos = args.offset;
> + ret = rw_verify_area(READ, file, &pos, args.len);
> + if (ret < 0)
> + goto out_free;
> +
> + init_sync_kiocb(&kiocb, file);
> + kiocb.ki_pos = pos;
> +
> + start = ALIGN_DOWN(pos, fs_info->sectorsize);
> + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> + ret = btrfs_encoded_read(&kiocb, &iter, &args,
> + issue_flags & IO_URING_F_NONBLOCK,
> + &cached_state, &disk_bytenr, &disk_io_size);
> + if (ret < 0 && ret != -EIOCBQUEUED)
> + goto out_free;
> +
> + file_accessed(file);
> +
> + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
> + (char *)&args + copy_end_kernel,
So many type casts again
> + sizeof(args) - copy_end_kernel)) {
> + if (ret == -EIOCBQUEUED) {
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + }
> + ret = -EFAULT;
> + goto out_free;
> + }
> +
> + if (ret == -EIOCBQUEUED) {
> + u64 count;
> +
> + /* If we've optimized things by storing the iovecs on the stack,
> + * undo this. */
This is not proper comment formatting.
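i.e. the usual multi-line style:

	/*
	 * If we've optimized things by storing the iovecs on the stack,
	 * undo this.
	 */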
> + if (!iov) {
> + iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
> + GFP_NOFS);
> + if (!iov) {
> + unlock_extent(io_tree, start, lockend,
> + &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + ret = -ENOMEM;
> + goto out_acct;
> + }
> +
> + memcpy(iov, iovstack,
> + sizeof(struct iovec) * args.iovcnt);
> + }
> +
> + count = min_t(u64, iov_iter_count(&iter), disk_io_size);
> +
> + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
> + cached_state, disk_bytenr,
> + disk_io_size, count,
> + args.compression, iov, cmd);
> +
> + goto out_acct;
> + }
> +
> +out_free:
> + kfree(iov);
> +
> +out_acct:
> + if (ret > 0)
> + add_rchar(current, ret);
> + inc_syscr(current);
> +
> + return ret;
> +}
> +
> +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
> +{
> + switch (cmd->cmd_op) {
> + case BTRFS_IOC_ENCODED_READ:
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> + case BTRFS_IOC_ENCODED_READ_32:
> +#endif
> + return btrfs_uring_encoded_read(cmd, issue_flags);
> + }
> +
> + return -EINVAL;
> +}
> +
> long btrfs_ioctl(struct file *file, unsigned int
> cmd, unsigned long arg)
> {
* Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-21 13:50 ` David Sterba
@ 2024-10-21 16:15 ` Pavel Begunkov
2024-10-21 17:05 ` Mark Harmstone
1 sibling, 0 replies; 17+ messages in thread
From: Pavel Begunkov @ 2024-10-21 16:15 UTC
To: dsterba, Mark Harmstone; +Cc: linux-btrfs, io-uring
On 10/21/24 14:50, David Sterba wrote:
> On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote:
>> Adds an io_uring command for encoded reads, using the same interface as
>
> Add ...
>
>> the existing BTRFS_IOC_ENCODED_READ ioctl.
>
> This is probably a good summary in a changelog, but the patch is quite
> long, so it feels like it should describe in more detail how it's done.
> Connecting two interfaces can be done in various ways, so at least
> mention that it's a simple pass-through, or whether there are any
> complications regarding locking, object lifetime and such.
>
>> Signed-off-by: Mark Harmstone <[email protected]>
...
>> +
>> + kfree(priv->pages);
>> + kfree(priv->iov);
>> + kfree(priv);
>> +}
>> +
>> +static void btrfs_uring_read_extent_cb(void *ctx, int err)
>> +{
>> + struct btrfs_uring_priv *priv = ctx;
>> +
>> + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
>
> Isn't there a helper for that? Type casting should be done in justified
> cases and as an exception.
FWIW, I haven't taken a look yet, but for this one, please use
io_uring_cmd_to_pdu(cmd, type), it'll check the size for you.
And there is no need to cast it to uintptr; it would be much nicer to
do:
struct btrfs_cmd {
struct btrfs_uring_priv *priv;
};
struct btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct btrfs_cmd);
bc->priv = priv;
You get more type checking this way. You can also wrap around
io_uring_cmd_to_pdu() into a static inline helper, if that
looks better.
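e.g. (a sketch, with a hypothetical name):

static inline struct btrfs_uring_priv *btrfs_uring_cmd_priv(struct io_uring_cmd *cmd)
{
	return io_uring_cmd_to_pdu(cmd, struct btrfs_cmd)->priv;
}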
...
>> +
>> + start = ALIGN_DOWN(pos, fs_info->sectorsize);
>> + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
>> +
>> + ret = btrfs_encoded_read(&kiocb, &iter, &args,
>> + issue_flags & IO_URING_F_NONBLOCK,
>> + &cached_state, &disk_bytenr, &disk_io_size);
>> + if (ret < 0 && ret != -EIOCBQUEUED)
>> + goto out_free;
>> +
>> + file_accessed(file);
>> +
>> + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
>> + (char *)&args + copy_end_kernel,
Be aware, SQE data is not stable; you should assume that userspace
can change it at any moment. It should be a READ_ONCE, and you likely
don't want it to be read twice unless you handle it / verify values /
etc. I'd recommend saving it early in the callback and stashing it
somewhere, e.g. into the struct btrfs_cmd I mentioned above.
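Something like this, as a sketch:

	/* Read the user pointer once; userspace may rewrite the SQE. */
	u64 sqe_addr = READ_ONCE(cmd->sqe->addr);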
>
> So many type casts again
>
--
Pavel Begunkov
* Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-21 13:50 ` David Sterba
2024-10-21 16:15 ` Pavel Begunkov
@ 2024-10-21 17:05 ` Mark Harmstone
2024-10-21 18:23 ` David Sterba
1 sibling, 1 reply; 17+ messages in thread
From: Mark Harmstone @ 2024-10-21 17:05 UTC
To: [email protected]; +Cc: [email protected], [email protected]
Thanks David.
On 21/10/24 14:50, David Sterba wrote:
>> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
>> + u64 start, u64 lockend,
>> + struct extent_state *cached_state,
>> + u64 disk_bytenr, u64 disk_io_size,
>> + size_t count, bool compressed,
>> + struct iovec *iov,
>> + struct io_uring_cmd *cmd)
>> +{
>> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
>> + struct extent_io_tree *io_tree = &inode->io_tree;
>> + struct page **pages;
>> + struct btrfs_uring_priv *priv = NULL;
>> + unsigned long nr_pages;
>> + int ret;
>> +
>> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
>> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
>> + if (!pages)
>> + return -ENOMEM;
>> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
>
> The allocation sizes are derived from disk_io_size, which comes from
> the outside, potentially making large allocations. Or is there some
> inherent limit on the maximum size?
Yes. It comes from btrfs_encoded_read, where it's limited to
BTRFS_MAX_UNCOMPRESSED (i.e. 128KB).
>> + if (ret) {
>> + ret = -ENOMEM;
>> + goto fail;
>> + }
>> +
>> + priv = kmalloc(sizeof(*priv), GFP_NOFS);
>> + if (!priv) {
>> + ret = -ENOMEM;
>> + goto fail;
>> + }
>> +
>> + priv->iocb = *iocb;
>> + priv->iov = iov;
>> + priv->iter = *iter;
>> + priv->count = count;
>> + priv->cmd = cmd;
>> + priv->cached_state = cached_state;
>> + priv->compressed = compressed;
>> + priv->nr_pages = nr_pages;
>> + priv->pages = pages;
>> + priv->start = start;
>> + priv->lockend = lockend;
>> +
>> + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
>> + disk_io_size, pages,
>> + btrfs_uring_read_extent_cb,
>> + priv);
>> + if (ret)
>> + goto fail;
>> +
>> + return -EIOCBQUEUED;
>> +
>> +fail:
>> + unlock_extent(io_tree, start, lockend, &cached_state);
>> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>> + kfree(priv);
>
> Does this leak pages and priv->pages?
No, they get freed in btrfs_uring_read_finished.
>> + return ret;
>> +}
>> +
>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
>> + unsigned int issue_flags)
>> +{
>> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
>> + flags);
>> + size_t copy_end;
>> + struct btrfs_ioctl_encoded_io_args args = {0};
> = { 0 }
>> + int ret;
>> + u64 disk_bytenr, disk_io_size;
>> + struct file *file = cmd->file;
>> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
>> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
>> + struct extent_io_tree *io_tree = &inode->io_tree;
>> + struct iovec iovstack[UIO_FASTIOV];
>> + struct iovec *iov = iovstack;
>> + struct iov_iter iter;
>> + loff_t pos;
>> + struct kiocb kiocb;
>> + struct extent_state *cached_state = NULL;
>> + u64 start, lockend;
>
> The stack consumption looks quite high.
696 bytes, compared to 672 in btrfs_ioctl_encoded_read.
btrfs_ioctl_encoded_write is pretty big too. Probably the easiest thing
here would be to allocate btrfs_uring_priv early and pass that around, I
think.
Do you have a recommendation for what the maximum stack size of a
function should be?
Mark
* Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-21 17:05 ` Mark Harmstone
@ 2024-10-21 18:23 ` David Sterba
2024-10-22 9:12 ` Mark Harmstone
0 siblings, 1 reply; 17+ messages in thread
From: David Sterba @ 2024-10-21 18:23 UTC
To: Mark Harmstone; +Cc: [email protected], [email protected]
On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote:
> >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> >> + unsigned int issue_flags)
> >> +{
> >> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> >> + flags);
> >> + size_t copy_end;
> >> + struct btrfs_ioctl_encoded_io_args args = {0};
> > = { 0 }
> >> + int ret;
> >> + u64 disk_bytenr, disk_io_size;
> >> + struct file *file = cmd->file;
> >> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> >> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
> >> + struct extent_io_tree *io_tree = &inode->io_tree;
> >> + struct iovec iovstack[UIO_FASTIOV];
> >> + struct iovec *iov = iovstack;
> >> + struct iov_iter iter;
> >> + loff_t pos;
> >> + struct kiocb kiocb;
> >> + struct extent_state *cached_state = NULL;
> >> + u64 start, lockend;
> >
> > The stack consumption looks quite high.
>
> 696 bytes, compared to 672 in btrfs_ioctl_encoded_read.
> btrfs_ioctl_encoded_write is pretty big too. Probably the easiest thing
> here would be to allocate btrfs_uring_priv early and pass that around, I
> think.
>
> Do you have a recommendation for what the maximum stack size of a
> function should be?
It depends on where the function is called from. For ioctl callbacks,
like btrfs_ioctl_encoded_read, it's the first function using kernel
stack, leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If
something similar applies to the io_uring callbacks then it's probably
fine.
Using a separate off-stack structure works, but it's a penalty as it
needs the allocation. io_uring is meant for high performance, so if the
on-stack allocation is safe then keep it like that.
I've checked the stack consumption on a release config and the encoded
ioctl functions are not the worst:
tree-log.c:btrfs_sync_log 728 static
scrub.c:scrub_verify_one_metadata 552 dynamic,bounded
inode.c:print_data_reloc_error 544 dynamic,bounded
uuid-tree.c:btrfs_uuid_scan_kthread 520 static
tree-checker.c:check_root_item 504 static
file-item.c:btrfs_csum_one_bio 496 static
inode.c:btrfs_start_delalloc_roots 488 static
scrub.c:scrub_raid56_parity_stripe 464 dynamic,bounded
disk-io.c:write_dev_supers 464 static
ioctl.c:btrfs_ioctl_encoded_write 456 dynamic,bounded
ioctl.c:btrfs_ioctl_encoded_read 456 dynamic,bounded
* Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
2024-10-21 18:23 ` David Sterba
@ 2024-10-22 9:12 ` Mark Harmstone
0 siblings, 0 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-22 9:12 UTC
To: [email protected]; +Cc: [email protected], [email protected]
On 21/10/24 19:23, David Sterba wrote:
> >
> On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote:
>>>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
>>>> + unsigned int issue_flags)
>>>> +{
>>>> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
>>>> + flags);
>>>> + size_t copy_end;
>>>> + struct btrfs_ioctl_encoded_io_args args = {0};
>>> = { 0 }
>>>> + int ret;
>>>> + u64 disk_bytenr, disk_io_size;
>>>> + struct file *file = cmd->file;
>>>> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
>>>> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
>>>> + struct extent_io_tree *io_tree = &inode->io_tree;
>>>> + struct iovec iovstack[UIO_FASTIOV];
>>>> + struct iovec *iov = iovstack;
>>>> + struct iov_iter iter;
>>>> + loff_t pos;
>>>> + struct kiocb kiocb;
>>>> + struct extent_state *cached_state = NULL;
>>>> + u64 start, lockend;
>>>
>>> The stack consumption looks quite high.
>>
>> 696 bytes, compared to 672 in btrfs_ioctl_encoded_read.
>> btrfs_ioctl_encoded_write is pretty big too. Probably the easiest thing
>> here would be to allocate btrfs_uring_priv early and pass that around, I
>> think.
>>
>> Do you have a recommendation for what the maximum stack size of a
>> function should be?
>
> It depends on where the function is called from. For ioctl callbacks,
> like btrfs_ioctl_encoded_read, it's the first function using kernel
> stack, leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If
> something similar applies to the io_uring callbacks then it's probably
> fine.
Thanks. Yes, the two functions should be broadly equivalent.
> Using a separate off-stack structure works, but it's a penalty as it
> needs the allocation. io_uring is meant for high performance, so if the
> on-stack allocation is safe then keep it like that.
Okay, I'll leave this bit as it is, then. I can revisit it if we start
getting a spike of stack overflow crashes mentioning
btrfs_uring_encoded_read.
>
> I've checked the stack consumption on a release config and the encoded
> ioctl functions are not the worst:
>
> tree-log.c:btrfs_sync_log 728 static
> scrub.c:scrub_verify_one_metadata 552 dynamic,bounded
> inode.c:print_data_reloc_error 544 dynamic,bounded
> uuid-tree.c:btrfs_uuid_scan_kthread 520 static
> tree-checker.c:check_root_item 504 static
> file-item.c:btrfs_csum_one_bio 496 static
> inode.c:btrfs_start_delalloc_roots 488 static
> scrub.c:scrub_raid56_parity_stripe 464 dynamic,bounded
> disk-io.c:write_dev_supers 464 static
> ioctl.c:btrfs_ioctl_encoded_write 456 dynamic,bounded
> ioctl.c:btrfs_ioctl_encoded_read 456 dynamic,bounded
* Re: [PATCH v3 0/5] btrfs: encoded reads via io_uring
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
` (4 preceding siblings ...)
2024-10-14 17:18 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
@ 2024-10-14 17:44 ` Boris Burkov
2024-10-15 8:50 ` Mark Harmstone
5 siblings, 1 reply; 17+ messages in thread
From: Boris Burkov @ 2024-10-14 17:44 UTC
To: Mark Harmstone; +Cc: linux-btrfs, io-uring
On Mon, Oct 14, 2024 at 06:18:22PM +0100, Mark Harmstone wrote:
> This is a re-do of my previous patchsets: I wasn't happy with how
> synchronous the previous version was in many ways, nor quite how badly
> it butchered the existing ioctl.
>
> This adds an io_uring cmd to btrfs to match the behaviour of the
> existing BTRFS_IOC_ENCODED_READ ioctl, which allows the reading of
> potentially compressed extents directly from the disk.
>
> Pavel mentioned on the previous patches whether we definitely need to
> keep the inode and the extent locked while doing I/O; I think the answer
> is probably yes, a) to prevent races with no-COW extents, and b) to
> prevent the extent from being deallocated from under us. But I think
> it's possible to resolve this, as a future optimization.
What branch is this based off of? I attempted to apply it to the current
btrfs for-next and
"btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback"
did not apply cleanly.
>
> Mark Harmstone (5):
> btrfs: remove pointless addition in btrfs_encoded_read
> btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback
> btrfs: change btrfs_encoded_read so that reading of extent is done by
> caller
> btrfs: add nowait parameter to btrfs_encoded_read
> btrfs: add io_uring command for encoded reads
>
> fs/btrfs/btrfs_inode.h | 23 ++-
> fs/btrfs/file.c | 1 +
> fs/btrfs/inode.c | 186 ++++++++++++++++--------
> fs/btrfs/ioctl.c | 316 ++++++++++++++++++++++++++++++++++++++++-
> fs/btrfs/ioctl.h | 1 +
> fs/btrfs/send.c | 15 +-
> 6 files changed, 476 insertions(+), 66 deletions(-)
>
> --
> 2.44.2
>
* Re: [PATCH v3 0/5] btrfs: encoded reads via io_uring
2024-10-14 17:44 ` [PATCH v3 0/5] btrfs: encoded reads via io_uring Boris Burkov
@ 2024-10-15 8:50 ` Mark Harmstone
0 siblings, 0 replies; 17+ messages in thread
From: Mark Harmstone @ 2024-10-15 8:50 UTC
To: Boris Burkov, Mark Harmstone
Cc: [email protected], [email protected]
On 14/10/24 18:44, Boris Burkov wrote:
> >
> On Mon, Oct 14, 2024 at 06:18:22PM +0100, Mark Harmstone wrote:
>> This is a re-do of my previous patchsets: I wasn't happy with how
>> synchronous the previous version was in many ways, nor quite how badly
>> it butchered the existing ioctl.
>>
>> This adds an io_uring cmd to btrfs to match the behaviour of the
>> existing BTRFS_IOC_ENCODED_READ ioctl, which allows the reading of
>> potentially compressed extents directly from the disk.
>>
>> Pavel mentioned on the previous patches whether we definitely need to
>> keep the inode and the extent locked while doing I/O; I think the answer
>> is probably yes, a) to prevent races with no-COW extents, and b) to
>> prevent the extent from being deallocated from under us. But I think
>> it's possible to resolve this, as a future optimization.
>
> What branch is this based off of? I attempted to apply it to the current
> btrfs for-next and
> "btrfs: change btrfs_encoded_read_regular_fill_pages to take a callback"
> did not apply cleanly.
This is against v6.11, because it's the latest stable version. I'm
guessing it ought to have been against upstream/master...
Mark