From: Hao Xu <[email protected]>
To: Xiaoguang Wang <[email protected]>,
[email protected]
Cc: [email protected], [email protected]
Subject: Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
Date: Thu, 3 Mar 2022 16:56:45 +0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
On 3/3/22 13:28, Xiaoguang Wang wrote:
> IORING_REGISTER_FILES is a good feature to reduce fget/fput overhead for
> each IO we do on file, but still left one, which is io_uring_enter(2).
> In io_uring_enter(2), it still fget/fput io_ring fd. I have observed
> this overhead in some our internal oroutine implementations based on
> io_uring with low submit batch. To totally remove fget/fput overhead in
> io_uring, we may add a small struct file cache in io_uring_task and add
> a new IORING_ENTER_FIXED_FILE flag. Currently the capacity of this file
> cache is 16, wihcih I think it maybe enough, also not that this cache is
> per-thread.
>
> Signed-off-by: Xiaoguang Wang <[email protected]>
> ---
> fs/io_uring.c | 127 +++++++++++++++++++++++++++++++++++++-----
> include/linux/io_uring.h | 5 +-
> include/uapi/linux/io_uring.h | 9 +++
> 3 files changed, 126 insertions(+), 15 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 77b9c7e4793b..e1d4b537cb60 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -461,6 +461,8 @@ struct io_ring_ctx {
> };
> };
>
> +#define IO_RINGFD_REG_MAX 16
> +
> struct io_uring_task {
> /* submission side */
> int cached_refs;
> @@ -476,6 +478,7 @@ struct io_uring_task {
> struct io_wq_work_list task_list;
> struct io_wq_work_list prior_task_list;
> struct callback_head task_work;
> + struct file **registered_files;
> bool task_running;
> };
>
> @@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
> if (unlikely(!tctx))
> return -ENOMEM;
>
> + tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
> + GFP_KERNEL);
> + if (unlikely(!tctx->registered_files)) {
> + kfree(tctx);
> + return -ENOMEM;
> + }
> +
> ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
> if (unlikely(ret)) {
> + kfree(tctx->registered_files);
> kfree(tctx);
> return ret;
> }
> @@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
> if (IS_ERR(tctx->io_wq)) {
> ret = PTR_ERR(tctx->io_wq);
> percpu_counter_destroy(&tctx->inflight);
> + kfree(tctx->registered_files);
> kfree(tctx);
> return ret;
> }
> @@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
> return -ENXIO;
> }
>
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
> +
> +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
> +{
> + struct io_uring_fd_reg reg;
> + struct io_uring_task *tctx;
> + struct file *file;
> + int ret;
> +
> + if (copy_from_user(®, arg, sizeof(struct io_uring_fd_reg)))
> + return -EFAULT;
> + if (reg.offset > IO_RINGFD_REG_MAX)
should be >=
> + return -EINVAL;
> +
> + ret = io_uring_add_tctx_node(ctx, true);
> + if (unlikely(ret))
> + return ret;
> +
> + tctx = current->io_uring;
> + if (tctx->registered_files[reg.offset])
> + return -EBUSY;
> + file = fget(reg.fd);
> + if (unlikely(!file))
> + return -EBADF;
> + tctx->registered_files[reg.offset] = file;
> + return 0;
> +}
> +
> +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
> +{
> + struct io_uring_task *tctx;
> + __u32 offset;
> +
> + if (!current->io_uring)
> + return 0;
> +
> + if (copy_from_user(&offset, arg, sizeof(__u32)))
> + return -EFAULT;
> + if (offset > IO_RINGFD_REG_MAX)
ditto
> + return -EINVAL;
> +
> + tctx = current->io_uring;
> + if (tctx->registered_files[offset]) {
> + fput(tctx->registered_files[offset]);
> + tctx->registered_files[offset] = NULL;
> + }
> +
> + return 0;
> +}
> +
> static void io_destroy_buffers(struct io_ring_ctx *ctx)
> {
> struct io_buffer *buf;
> @@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
> }
> }
>
> -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
> {
> struct io_uring_task *tctx = current->io_uring;
> struct io_tctx_node *node;
> @@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> return ret;
> }
>
> - mutex_lock(&ctx->uring_lock);
> + if (!locked)
> + mutex_lock(&ctx->uring_lock);
> list_add(&node->ctx_node, &ctx->tctx_list);
> - mutex_unlock(&ctx->uring_lock);
> + if (!locked)
> + mutex_unlock(&ctx->uring_lock);
> }
> tctx->last = ctx;
> return 0;
> @@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> /*
> * Note that this task has used io_uring. We use it for cancelation purposes.
> */
> -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
> {
> struct io_uring_task *tctx = current->io_uring;
>
> if (likely(tctx && tctx->last == ctx))
> return 0;
> - return __io_uring_add_tctx_node(ctx);
> + return __io_uring_add_tctx_node(ctx, locked);
> }
>
> /*
> @@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all)
> io_uring_cancel_generic(cancel_all, NULL);
> }
>
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx)
> +{
> + int i;
> +
> + for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
> + if (tctx->registered_files[i])
> + fput(tctx->registered_files[i]);
> + }
> +}
> +
> static void *io_uring_validate_mmap_request(struct file *file,
> loff_t pgoff, size_t sz)
> {
> @@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> struct io_ring_ctx *ctx;
> int submitted = 0;
> struct fd f;
> + struct file *file;
> long ret;
>
> io_run_task_work();
>
> if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
> - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
> + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
> + IORING_ENTER_FIXED_FILE)))
> return -EINVAL;
>
> - f = fdget(fd);
> - if (unlikely(!f.file))
> - return -EBADF;
> + if (flags & IORING_ENTER_FIXED_FILE) {
> + if (fd > IO_RINGFD_REG_MAX || !current->io_uring)
ditto
> + return -EINVAL;
> +
> + file = current->io_uring->registered_files[fd];
> + if (unlikely(!file))
> + return -EBADF;
> + } else {
> + f = fdget(fd);
> + if (unlikely(!f.file))
> + return -EBADF;
> + file = f.file;
> + }
>
> ret = -EOPNOTSUPP;
> - if (unlikely(f.file->f_op != &io_uring_fops))
> + if (unlikely(file->f_op != &io_uring_fops))
> goto out_fput;
>
> ret = -ENXIO;
> - ctx = f.file->private_data;
> + ctx = file->private_data;
> if (unlikely(!percpu_ref_tryget(&ctx->refs)))
> goto out_fput;
>
> @@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> }
> submitted = to_submit;
> } else if (to_submit) {
> - ret = io_uring_add_tctx_node(ctx);
> + ret = io_uring_add_tctx_node(ctx, false);
> if (unlikely(ret))
> goto out;
> mutex_lock(&ctx->uring_lock);
> @@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> out:
> percpu_ref_put(&ctx->refs);
> out_fput:
> - fdput(f);
> + if (!(flags & IORING_ENTER_FIXED_FILE))
> + fdput(f);
> return submitted ? submitted : ret;
> }
>
> @@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
> if (fd < 0)
> return fd;
>
> - ret = io_uring_add_tctx_node(ctx);
> + ret = io_uring_add_tctx_node(ctx, false);
> if (ret) {
> put_unused_fd(fd);
> return ret;
> @@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> break;
> ret = io_register_iowq_max_workers(ctx, arg);
> break;
> + case IORING_REGISTER_IORINGFD:
> + ret = -EINVAL;
> + if (nr_args != 1)
> + break;
> + ret = io_ringfd_register(ctx, arg);
> + break;
> + case IORING_UNREGISTER_IORINGFD:
> + ret = -EINVAL;
> + if (nr_args != 1)
> + break;
> + ret = io_ringfd_unregister(ctx, arg);
> + break;
> default:
> ret = -EINVAL;
> break;
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 649a4d7c241b..5ddea83912c7 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -9,11 +9,14 @@
> struct sock *io_uring_get_socket(struct file *file);
> void __io_uring_cancel(bool cancel_all);
> void __io_uring_free(struct task_struct *tsk);
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx);
>
> static inline void io_uring_files_cancel(void)
> {
> - if (current->io_uring)
> + if (current->io_uring) {
> + io_uring_unreg_ringfd(current->io_uring);
> __io_uring_cancel(false);
> + }
> }
> static inline void io_uring_task_cancel(void)
> {
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 787f491f0d2a..f076a203317a 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -261,6 +261,7 @@ struct io_cqring_offsets {
> #define IORING_ENTER_SQ_WAKEUP (1U << 1)
> #define IORING_ENTER_SQ_WAIT (1U << 2)
> #define IORING_ENTER_EXT_ARG (1U << 3)
> +#define IORING_ENTER_FIXED_FILE (1U << 4)
>
> /*
> * Passed in for io_uring_setup(2). Copied back with updated info on success
> @@ -325,6 +326,9 @@ enum {
> /* set/get max number of io-wq workers */
> IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
>
> + IORING_REGISTER_IORINGFD = 20,
> + IORING_UNREGISTER_IORINGFD = 21,
> +
> /* this goes last */
> IORING_REGISTER_LAST
> };
> @@ -422,4 +426,9 @@ struct io_uring_getevents_arg {
> __u64 ts;
> };
>
> +struct io_uring_fd_reg {
> + __u32 offset;
> + __s32 fd;
> +};
> +
> #endif
next prev parent reply other threads:[~2022-03-03 8:56 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-03-03 5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
2022-03-03 8:56 ` Hao Xu [this message]
2022-03-03 13:38 ` Jens Axboe
2022-03-03 14:36 ` Jens Axboe
2022-03-03 14:40 ` Jens Axboe
2022-03-03 16:31 ` Jens Axboe
2022-03-03 17:18 ` Jens Axboe
2022-03-03 20:41 ` Jens Axboe
2022-03-03 21:19 ` Jens Axboe
2022-03-04 0:07 ` Jens Axboe
2022-03-04 13:39 ` Xiaoguang Wang
2022-03-04 13:44 ` Jens Axboe
2022-03-04 15:16 ` Xiaoguang Wang
2022-03-04 15:22 ` Jens Axboe
2022-03-08 8:38 ` Xiaoguang Wang
2022-03-08 13:10 ` Jens Axboe
2022-03-03 22:24 ` Vito Caputo
2022-03-03 22:26 ` Jens Axboe
2022-03-04 1:49 ` Pavel Begunkov
2022-03-04 2:18 ` Jens Axboe
2022-03-04 2:28 ` Pavel Begunkov
2022-03-04 2:35 ` Pavel Begunkov
2022-03-04 2:43 ` Jens Axboe
2022-03-04 1:52 ` Pavel Begunkov
2022-03-04 2:19 ` Jens Axboe
2022-03-04 2:39 ` Pavel Begunkov
2022-03-04 3:03 ` Jens Axboe
2022-04-21 14:16 ` Hao Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=08e460c0-7a2f-a60e-adc3-69ce470cd07d@linux.alibaba.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox