From: Anuj gupta <[email protected]>
To: Jens Axboe <[email protected]>
Cc: [email protected]
Subject: Re: [PATCH 10/17] io_uring/rw: always setup io_async_rw for read/write requests
Date: Mon, 25 Mar 2024 17:33:09 +0530
Message-ID: <CACzX3AvbFtCAH8Lr_zsNjQeMMhrRFdrmLcE=zRygWe61nL5YAA@mail.gmail.com>
In-Reply-To: <[email protected]>
On Thu, Mar 21, 2024 at 4:28 AM Jens Axboe <[email protected]> wrote:
>
> read/write requests try to put everything on the stack, and then alloc
> and copy if we need to retry. This necessitates a bunch of nasty code
> that deals with intermediate state.
>
> Get rid of this, and have the prep side setup everything we need
> upfront, which greatly simplifies the opcode handlers.
>
> This includes adding an alloc cache for io_async_rw, to make it cheap
> to handle.
>
> In terms of cost, this should be basically free and transparent. For
> the worst case of {READ,WRITE}_FIXED which didn't need it before,
> performance is unaffected in the normal peak workload that is being
> used to test that. Still runs at 122M IOPS.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> include/linux/io_uring_types.h | 1 +
> io_uring/io_uring.c | 3 +
> io_uring/opdef.c | 15 +-
> io_uring/rw.c | 538 ++++++++++++++++-----------------
> io_uring/rw.h | 19 +-
> 5 files changed, 278 insertions(+), 298 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f37caff64d05..2ba8676f83cc 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -300,6 +300,7 @@ struct io_ring_ctx {
> struct io_hash_table cancel_table_locked;
> struct io_alloc_cache apoll_cache;
> struct io_alloc_cache netmsg_cache;
> + struct io_alloc_cache rw_cache;
>
> /*
> * Any cancelable uring_cmd is added to this list in
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ff0e233ce3c9..cc8ce830ff4b 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -308,6 +308,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> sizeof(struct async_poll));
> io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
> sizeof(struct io_async_msghdr));
> + io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
> + sizeof(struct io_async_rw));
> io_futex_cache_init(ctx);
> init_completion(&ctx->ref_comp);
> xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
> @@ -2898,6 +2900,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
> io_eventfd_unregister(ctx);
> io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
> io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
> + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
> io_futex_cache_free(ctx);
> io_destroy_buffers(ctx);
> mutex_unlock(&ctx->uring_lock);
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index dd4a1e1425e1..fcae75a08f2c 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -67,7 +67,7 @@ const struct io_issue_def io_issue_defs[] = {
> .iopoll = 1,
> .iopoll_queue = 1,
> .vectored = 1,
> - .prep = io_prep_rwv,
> + .prep = io_prep_readv,
> .issue = io_read,
> },
> [IORING_OP_WRITEV] = {
> @@ -81,7 +81,7 @@ const struct io_issue_def io_issue_defs[] = {
> .iopoll = 1,
> .iopoll_queue = 1,
> .vectored = 1,
> - .prep = io_prep_rwv,
> + .prep = io_prep_writev,
> .issue = io_write,
> },
> [IORING_OP_FSYNC] = {
> @@ -99,7 +99,7 @@ const struct io_issue_def io_issue_defs[] = {
> .ioprio = 1,
> .iopoll = 1,
> .iopoll_queue = 1,
> - .prep = io_prep_rw_fixed,
> + .prep = io_prep_read_fixed,
> .issue = io_read,
> },
> [IORING_OP_WRITE_FIXED] = {
> @@ -112,7 +112,7 @@ const struct io_issue_def io_issue_defs[] = {
> .ioprio = 1,
> .iopoll = 1,
> .iopoll_queue = 1,
> - .prep = io_prep_rw_fixed,
> + .prep = io_prep_write_fixed,
> .issue = io_write,
> },
> [IORING_OP_POLL_ADD] = {
> @@ -239,7 +239,7 @@ const struct io_issue_def io_issue_defs[] = {
> .ioprio = 1,
> .iopoll = 1,
> .iopoll_queue = 1,
> - .prep = io_prep_rw,
> + .prep = io_prep_read,
> .issue = io_read,
> },
> [IORING_OP_WRITE] = {
> @@ -252,7 +252,7 @@ const struct io_issue_def io_issue_defs[] = {
> .ioprio = 1,
> .iopoll = 1,
> .iopoll_queue = 1,
> - .prep = io_prep_rw,
> + .prep = io_prep_write,
> .issue = io_write,
> },
> [IORING_OP_FADVISE] = {
> @@ -490,14 +490,12 @@ const struct io_cold_def io_cold_defs[] = {
> [IORING_OP_READV] = {
> .async_size = sizeof(struct io_async_rw),
> .name = "READV",
> - .prep_async = io_readv_prep_async,
> .cleanup = io_readv_writev_cleanup,
> .fail = io_rw_fail,
> },
> [IORING_OP_WRITEV] = {
> .async_size = sizeof(struct io_async_rw),
> .name = "WRITEV",
> - .prep_async = io_writev_prep_async,
> .cleanup = io_readv_writev_cleanup,
> .fail = io_rw_fail,
> },
> @@ -699,6 +697,7 @@ const struct io_cold_def io_cold_defs[] = {
> #endif
> },
> [IORING_OP_READ_MULTISHOT] = {
> + .async_size = sizeof(struct io_async_rw),
> .name = "READ_MULTISHOT",
> },
> [IORING_OP_WAITID] = {
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 35216e8adc29..583fe61a0acb 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -75,7 +75,153 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req)
> return 0;
> }
>
> -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +static int __io_import_iovec(int ddir, struct io_kiocb *req,
> + struct io_async_rw *io,
> + unsigned int issue_flags)
> +{
> + const struct io_issue_def *def = &io_issue_defs[req->opcode];
> + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> + void __user *buf;
> + size_t sqe_len;
> +
> + buf = u64_to_user_ptr(rw->addr);
> + sqe_len = rw->len;
> +
> + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) {
> + if (io_do_buffer_select(req)) {
> + buf = io_buffer_select(req, &sqe_len, issue_flags);
> + if (!buf)
> + return -ENOBUFS;
> + rw->addr = (unsigned long) buf;
> + rw->len = sqe_len;
> + }
> +
> + return import_ubuf(ddir, buf, sqe_len, &io->s.iter);
> + }
> +
> + io->free_iovec = io->s.fast_iov;
> + return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec,
> + &io->s.iter, req->ctx->compat);
> +}
> +
> +static inline int io_import_iovec(int rw, struct io_kiocb *req,
> + struct io_async_rw *io,
> + unsigned int issue_flags)
> +{
> + int ret;
> +
> + ret = __io_import_iovec(rw, req, io, issue_flags);
> + if (unlikely(ret < 0))
> + return ret;
> +
> + iov_iter_save_state(&io->s.iter, &io->s.iter_state);
> + return 0;
> +}
> +
> +static void io_rw_iovec_free(struct io_async_rw *rw)
> +{
> + if (rw->free_iovec) {
> + kfree(rw->free_iovec);
> + rw->free_iovec = NULL;
> + }
> +}
> +
> +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
> +{
> + struct io_async_rw *rw = req->async_data;
> +
> + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
> + io_rw_iovec_free(rw);
> + return;
> + }
> + if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) {
> + req->async_data = NULL;
> + req->flags &= ~REQ_F_ASYNC_DATA;
> + }
> +}
> +
> +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
> +{
> + /*
> + * Disable quick recycling for anything that's gone through io-wq.
> + * In theory, this should be fine to cleanup. However, some read or
> + * write iter handling touches the iovec AFTER having called into the
> + * handler, eg to reexpand or revert. This means we can have:
> + *
> + * task io-wq
> + * issue
> + * punt to io-wq
> + * issue
> + * blkdev_write_iter()
> + * ->ki_complete()
> + * io_complete_rw()
> + * queue tw complete
> + * run tw
> + * req_rw_cleanup
> + * iov_iter_count() <- look at iov_iter again
> + *
> + * which can lead to a UAF. This is only possible for io-wq offload
> + * as the cleanup can run in parallel. As io-wq is not the fast path,
> + * just leave cleanup to the end.
> + *
> + * This is really a bug in the core code that does this, any issue
> + * path should assume that a successful (or -EIOCBQUEUED) return can
> + * mean that the underlying data can be gone at any time. But that
> + * should be fixed separately, and then this check could be killed.
> + */
> + if (!(req->flags & REQ_F_REFCOUNT)) {
> + req->flags &= ~REQ_F_NEED_CLEANUP;
> + io_rw_recycle(req, issue_flags);
> + }
> +}
> +
> +static int io_rw_alloc_async(struct io_kiocb *req)
> +{
> + struct io_ring_ctx *ctx = req->ctx;
> + struct io_cache_entry *entry;
> + struct io_async_rw *rw;
> +
> + entry = io_alloc_cache_get(&ctx->rw_cache);
> + if (entry) {
> + rw = container_of(entry, struct io_async_rw, cache);
> + req->flags |= REQ_F_ASYNC_DATA;
> + req->async_data = rw;
> + goto done;
> + }
> +
> + if (!io_alloc_async_data(req)) {
> + rw = req->async_data;
> +done:
> + rw->free_iovec = NULL;
> + rw->bytes_done = 0;
> + return 0;
> + }
> +
> + return -ENOMEM;
> +}
> +
> +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
> +{
> + struct io_async_rw *rw;
> + int ret;
> +
> + if (io_rw_alloc_async(req))
> + return -ENOMEM;
> +
> + if (!do_import || io_do_buffer_select(req))
> + return 0;
> +
> + rw = req->async_data;
> + ret = io_import_iovec(ddir, req, rw, 0);
> + if (unlikely(ret < 0))
> + return ret;
> +
> + iov_iter_save_state(&rw->s.iter, &rw->s.iter_state);
It seems that the iov_iter state already gets saved inside io_import_iovec,
which is called just above. Do we need to save it again here?
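
To make the question concrete, here is a trimmed view of the flow as I read
the two hunks above (everything except the save_state calls is elided, so
this is not a standalone snippet):

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct io_async_rw *io,
				  unsigned int issue_flags)
{
	int ret;

	ret = __io_import_iovec(rw, req, io, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	/* first save of the iter state, done for every successful import */
	iov_iter_save_state(&io->s.iter, &io->s.iter_state);
	return 0;
}

static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
{
	/* ... allocation and buffer-select checks elided ... */
	rw = req->async_data;
	ret = io_import_iovec(ddir, req, rw, 0);
	if (unlikely(ret < 0))
		return ret;

	/* second save of the same state, right after io_import_iovec() */
	iov_iter_save_state(&rw->s.iter, &rw->s.iter_state);
	/* ... */
}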
--
Anuj Gupta
Thread overview: 30+ messages
2024-03-20 22:55 [PATCHSET v2 0/17] Improve async state handling Jens Axboe
2024-03-20 22:55 ` [PATCH 01/17] io_uring/net: switch io_send() and io_send_zc() to using io_async_msghdr Jens Axboe
2024-04-06 20:58 ` Pavel Begunkov
2024-04-07 21:47 ` Jens Axboe
2024-03-20 22:55 ` [PATCH 02/17] io_uring/net: switch io_recv() " Jens Axboe
2024-03-20 22:55 ` [PATCH 03/17] io_uring/net: unify cleanup handling Jens Axboe
2024-03-20 22:55 ` [PATCH 04/17] io_uring/net: always setup an io_async_msghdr Jens Axboe
2024-03-20 22:55 ` [PATCH 05/17] io_uring/net: get rid of ->prep_async() for receive side Jens Axboe
2024-03-20 22:55 ` [PATCH 06/17] io_uring/net: get rid of ->prep_async() for send side Jens Axboe
2024-03-20 22:55 ` [PATCH 07/17] io_uring: kill io_msg_alloc_async_prep() Jens Axboe
2024-03-20 22:55 ` [PATCH 08/17] io_uring/net: add iovec recycling Jens Axboe
2024-03-20 22:55 ` [PATCH 09/17] io_uring/net: drop 'kmsg' parameter from io_req_msg_cleanup() Jens Axboe
2024-03-20 22:55 ` [PATCH 10/17] io_uring/rw: always setup io_async_rw for read/write requests Jens Axboe
2024-03-25 12:03 ` Anuj gupta [this message]
2024-03-25 14:54 ` Jens Axboe
2024-03-20 22:55 ` [PATCH 11/17] io_uring: get rid of struct io_rw_state Jens Axboe
2024-03-20 22:55 ` [PATCH 12/17] io_uring/rw: add iovec recycling Jens Axboe
2024-03-20 22:55 ` [PATCH 13/17] io_uring/net: move connect to always using async data Jens Axboe
2024-03-20 22:55 ` [PATCH 14/17] io_uring/uring_cmd: switch to always allocating " Jens Axboe
2024-03-20 22:55 ` [PATCH 15/17] io_uring/uring_cmd: defer SQE copying until we need it Jens Axboe
2024-03-25 12:41 ` Anuj gupta
2024-03-25 14:55 ` Jens Axboe
2024-03-20 22:55 ` [PATCH 16/17] io_uring: drop ->prep_async() Jens Axboe
2024-04-06 20:54 ` Pavel Begunkov
2024-04-07 21:46 ` Jens Axboe
2024-03-20 22:55 ` [PATCH 17/17] io_uring/alloc_cache: switch to array based caching Jens Axboe
2024-03-21 15:59 ` Gabriel Krisman Bertazi
2024-03-21 16:38 ` Jens Axboe
2024-03-21 17:20 ` Gabriel Krisman Bertazi
2024-03-21 17:22 ` Jens Axboe