From: Pavel Begunkov <[email protected]>
To: Mark Harmstone <[email protected]>, [email protected]
Cc: [email protected]
Subject: Re: [PATCH 5/5] btrfs: add io_uring command for encoded reads
Date: Wed, 30 Oct 2024 00:59:33 +0000 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
On 10/22/24 15:50, Mark Harmstone wrote:
...
> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + struct btrfs_uring_priv *priv =
> + *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
> + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + unsigned long i;
> + u64 cur;
> + size_t page_offset;
> + ssize_t ret;
> +
> + if (priv->err) {
> + ret = priv->err;
> + goto out;
> + }
> +
> + if (priv->compressed) {
> + i = 0;
> + page_offset = 0;
> + } else {
> + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
> + }
> + cur = 0;
> + while (cur < priv->count) {
> + size_t bytes = min_t(size_t, priv->count - cur,
> + PAGE_SIZE - page_offset);
> +
> + if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> + &priv->iter) != bytes) {
If that's an iovec backed iter that might fail, you'd need to
steal this patch
https://lore.kernel.org/all/[email protected]/
and fail if "issue_flags & IO_URING_F_TASK_DEAD", see
https://lore.kernel.org/all/[email protected]/
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + i++;
> + cur += bytes;
> + page_offset = 0;
> + }
> + ret = priv->count;
> +
> +out:
> + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
When called via io_uring_cmd_complete_in_task() this function might
not get run in any reasonable amount of time. Even worse, a
misbehaving user can block it until the task dies.
I don't remember if rwsem allows unlock being executed in a different
task than the pairing lock, but blocking it for that long could be a
problem. I might not remember it right but I think Boris meantioned
that the O_DIRECT path drops the inode lock right after submission
without waiting for bios to complete. Is that right? Can we do it
here as well?
> +
> + io_uring_cmd_done(cmd, ret, 0, issue_flags);
> + add_rchar(current, ret);
> +
> + for (unsigned long index = 0; index < priv->nr_pages; index++)
> + __free_page(priv->pages[index]);
> +
> + kfree(priv->pages);
> + kfree(priv->iov);
> + kfree(priv);
> +}
> +
> +void btrfs_uring_read_extent_endio(void *ctx, int err)
> +{
> + struct btrfs_uring_priv *priv = ctx;
> +
> + priv->err = err;
> +
> + *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;
a nit, I'd suggest to create a temp var, should be easier to
read. It'd even be nicer if you wrap it into a structure
as suggested last time.
struct io_btrfs_cmd {
struct btrfs_uring_priv *priv;
};
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
bc->priv = priv;
> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> +}
> +
> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> + u64 start, u64 lockend,
> + struct extent_state *cached_state,
> + u64 disk_bytenr, u64 disk_io_size,
> + size_t count, bool compressed,
> + struct iovec *iov,
> + struct io_uring_cmd *cmd)
> +{
> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct page **pages;
> + struct btrfs_uring_priv *priv = NULL;
> + unsigned long nr_pages;
> + int ret;
> +
> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> + if (ret) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv = kmalloc(sizeof(*priv), GFP_NOFS);
> + if (!priv) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + priv->iocb = *iocb;
> + priv->iov = iov;
> + priv->iter = *iter;
> + priv->count = count;
> + priv->cmd = cmd;
> + priv->cached_state = cached_state;
> + priv->compressed = compressed;
> + priv->nr_pages = nr_pages;
> + priv->pages = pages;
> + priv->start = start;
> + priv->lockend = lockend;
> + priv->err = 0;
> +
> + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
> + disk_io_size, pages,
> + priv);
> + if (ret && ret != -EIOCBQUEUED)
> + goto fail;
Turning both into return EIOCBQUEUED is a bit suspicious, but
I lack context to say. Might make sense to return ret and let
the caller handle it.
> +
> + /*
> + * If we return -EIOCBQUEUED, we're deferring the cleanup to
> + * btrfs_uring_read_finished, which will handle unlocking the extent
> + * and inode and freeing the allocations.
> + */
> +
> + return -EIOCBQUEUED;
> +
> +fail:
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + kfree(priv);
> + return ret;
> +}
> +
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> + unsigned int issue_flags)
> +{
> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> + flags);
> + size_t copy_end;
> + struct btrfs_ioctl_encoded_io_args args = { 0 };
> + int ret;
> + u64 disk_bytenr, disk_io_size;
> + struct file *file = cmd->file;
> + struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> + struct btrfs_fs_info *fs_info = inode->root->fs_info;
> + struct extent_io_tree *io_tree = &inode->io_tree;
> + struct iovec iovstack[UIO_FASTIOV];
> + struct iovec *iov = iovstack;
> + struct iov_iter iter;
> + loff_t pos;
> + struct kiocb kiocb;
> + struct extent_state *cached_state = NULL;
> + u64 start, lockend;
> + void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
Let's rename it, I was taken aback for a brief second why
you're copy_from_user() from an SQE / the ring, which turns
out to be a user pointer to a btrfs structure.
...
> + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
> + &disk_bytenr, &disk_io_size);
> + if (ret < 0 && ret != -EIOCBQUEUED)
> + goto out_free;
> +
> + file_accessed(file);
> +
> + if (copy_to_user(sqe_addr + copy_end, (char *)&args + copy_end_kernel,
> + sizeof(args) - copy_end_kernel)) {
> + if (ret == -EIOCBQUEUED) {
> + unlock_extent(io_tree, start, lockend, &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + }> + ret = -EFAULT;
> + goto out_free;
It seems we're saving iov in the priv structure, who can access the iovec
after the request is submitted? -EIOCBQUEUED in general means that the
request is submitted and will get completed async, e.g. via callback, and
if the bio callback can use the iov maybe via the iter, this goto will be
a use after free.
Also, you're returning -EFAULT back to io_uring, which will kill the
io_uring request / cmd while there might still be in flight bios that
can try to access it.
Can you inject errors into the copy and test please?
> + }
> +
> + if (ret == -EIOCBQUEUED) {
> + u64 count;
> +
> + /*
> + * If we've optimized things by storing the iovecs on the stack,
> + * undo this.
> + */> + if (!iov) {
> + iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
> + GFP_NOFS);
> + if (!iov) {
> + unlock_extent(io_tree, start, lockend,
> + &cached_state);
> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> + ret = -ENOMEM;
> + goto out_acct;
> + }
> +
> + memcpy(iov, iovstack,
> + sizeof(struct iovec) * args.iovcnt);
> + }
> +
> + count = min_t(u64, iov_iter_count(&iter), disk_io_size);
> +
> + /* Match ioctl by not returning past EOF if uncompressed */
> + if (!args.compression)
> + count = min_t(u64, count, args.len);
> +
> + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
> + cached_state, disk_bytenr,
> + disk_io_size, count,
> + args.compression, iov, cmd);
> +
> + goto out_acct;
> + }
> +
> +out_free:
> + kfree(iov);
> +
> +out_acct:
> + if (ret > 0)
> + add_rchar(current, ret);
> + inc_syscr(current);
> +
> + return ret;
> +}
--
Pavel Begunkov
next prev parent reply other threads:[~2024-10-30 0:59 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-22 14:50 [PATCH v4 0/5] btrfs: io_uring interface for encoded reads Mark Harmstone
2024-10-22 14:50 ` [PATCH 1/5] btrfs: remove pointless addition in btrfs_encoded_read Mark Harmstone
2024-10-30 1:46 ` Anand Jain
2024-10-22 14:50 ` [PATCH 2/5] btrfs: change btrfs_encoded_read so that reading of extent is done by caller Mark Harmstone
2024-10-22 14:50 ` [PATCH 3/5] btrfs: don't sleep in btrfs_encoded_read if IOCB_NOWAIT set Mark Harmstone
2024-10-22 14:50 ` [PATCH 4/5] btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages Mark Harmstone
2024-10-22 14:50 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
2024-10-29 21:51 ` David Sterba
2024-10-30 0:59 ` Pavel Begunkov [this message]
2024-10-30 1:24 ` David Sterba
2024-10-30 2:32 ` Pavel Begunkov
2024-10-29 21:29 ` [PATCH v4 0/5] btrfs: io_uring interface " David Sterba
2024-10-30 1:22 ` Anand Jain
-- strict thread matches above, loose matches on Subject: below --
2024-10-14 17:18 [PATCH v3 0/5] btrfs: encoded reads via io_uring Mark Harmstone
2024-10-14 17:18 ` [PATCH 5/5] btrfs: add io_uring command for encoded reads Mark Harmstone
2024-10-21 13:50 ` David Sterba
2024-10-21 16:15 ` Pavel Begunkov
2024-10-21 17:05 ` Mark Harmstone
2024-10-21 18:23 ` David Sterba
2024-10-22 9:12 ` Mark Harmstone
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox