From: Hao Xu <[email protected]>
To: Xiaoguang Wang <[email protected]>,
	[email protected]
Cc: [email protected], [email protected]
Subject: Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
Date: Thu, 3 Mar 2022 16:56:45 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>


On 3/3/22 13:28, Xiaoguang Wang wrote:
> IORING_REGISTER_FILES is a good feature for reducing the fget/fput
> overhead on each IO we do on a file, but one source remains:
> io_uring_enter(2) still does fget/fput on the io_uring fd itself. I have
> observed this overhead in some of our internal coroutine implementations
> based on io_uring with a low submit batch. To remove the fget/fput
> overhead in io_uring entirely, we can add a small struct file cache to
> io_uring_task and a new IORING_ENTER_FIXED_FILE flag. The capacity of
> this file cache is currently 16, which I think should be enough; also
> note that this cache is per-thread.
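
For reference, a minimal userspace sketch of how I read the proposed
interface (raw syscalls only, since liburing would not have support for
this yet; the helper names here are made up, while IORING_REGISTER_IORINGFD,
IORING_ENTER_FIXED_FILE and struct io_uring_fd_reg come from the hunks
below; assumes uapi headers updated with this patch, error handling
omitted for brevity):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Register ring_fd into per-thread slot 'offset' (0..15). */
static int ring_fd_register(int ring_fd, unsigned int offset)
{
	struct io_uring_fd_reg reg = {
		.offset = offset,
		.fd     = ring_fd,
	};

	/* nr_args must be 1, see io_ringfd_register() below */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IORINGFD, &reg, 1);
}

/* Submit via the registered slot; no fget/fput on the ring fd. */
static int ring_enter_fixed(unsigned int offset, unsigned int to_submit)
{
	return syscall(__NR_io_uring_enter, offset, to_submit, 0,
		       IORING_ENTER_FIXED_FILE, NULL, 0);
}
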
>
> Signed-off-by: Xiaoguang Wang <[email protected]>
> ---
>   fs/io_uring.c                 | 127 +++++++++++++++++++++++++++++++++++++-----
>   include/linux/io_uring.h      |   5 +-
>   include/uapi/linux/io_uring.h |   9 +++
>   3 files changed, 126 insertions(+), 15 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 77b9c7e4793b..e1d4b537cb60 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -461,6 +461,8 @@ struct io_ring_ctx {
>   	};
>   };
>   
> +#define IO_RINGFD_REG_MAX 16
> +
>   struct io_uring_task {
>   	/* submission side */
>   	int			cached_refs;
> @@ -476,6 +478,7 @@ struct io_uring_task {
>   	struct io_wq_work_list	task_list;
>   	struct io_wq_work_list	prior_task_list;
>   	struct callback_head	task_work;
> +	struct file		**registered_files;
>   	bool			task_running;
>   };
>   
> @@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (unlikely(!tctx))
>   		return -ENOMEM;
>   
> +	tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
> +					 GFP_KERNEL);
> +	if (unlikely(!tctx->registered_files)) {
> +		kfree(tctx);
> +		return -ENOMEM;
> +	}
> +
>   	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
>   	if (unlikely(ret)) {
> +		kfree(tctx->registered_files);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (IS_ERR(tctx->io_wq)) {
>   		ret = PTR_ERR(tctx->io_wq);
>   		percpu_counter_destroy(&tctx->inflight);
> +		kfree(tctx->registered_files);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
>   	return -ENXIO;
>   }
>   
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
> +
> +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_fd_reg reg;
> +	struct io_uring_task *tctx;
> +	struct file *file;
> +	int ret;
> +
> +	if (copy_from_user(&reg, arg, sizeof(struct io_uring_fd_reg)))
> +		return -EFAULT;
> +	if (reg.offset > IO_RINGFD_REG_MAX)
should be >=
> +		return -EINVAL;
> +
> +	ret = io_uring_add_tctx_node(ctx, true);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	tctx = current->io_uring;
> +	if (tctx->registered_files[reg.offset])
> +		return -EBUSY;
> +	file = fget(reg.fd);
> +	if (unlikely(!file))
> +		return -EBADF;
> +	tctx->registered_files[reg.offset] = file;
> +	return 0;
> +}
> +
> +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_task *tctx;
> +	__u32 offset;
> +
> +	if (!current->io_uring)
> +		return 0;
> +
> +	if (copy_from_user(&offset, arg, sizeof(__u32)))
> +		return -EFAULT;
> +	if (offset > IO_RINGFD_REG_MAX)
ditto
> +		return -EINVAL;
> +
> +	tctx = current->io_uring;
> +	if (tctx->registered_files[offset]) {
> +		fput(tctx->registered_files[offset]);
> +		tctx->registered_files[offset] = NULL;
> +	}
> +
> +	return 0;
> +}
> +
>   static void io_destroy_buffers(struct io_ring_ctx *ctx)
>   {
>   	struct io_buffer *buf;
> @@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
>   	}
>   }
>   
> -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
>   {
>   	struct io_uring_task *tctx = current->io_uring;
>   	struct io_tctx_node *node;
> @@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
>   			return ret;
>   		}
>   
> -		mutex_lock(&ctx->uring_lock);
> +		if (!locked)
> +			mutex_lock(&ctx->uring_lock);
>   		list_add(&node->ctx_node, &ctx->tctx_list);
> -		mutex_unlock(&ctx->uring_lock);
> +		if (!locked)
> +			mutex_unlock(&ctx->uring_lock);
>   	}
>   	tctx->last = ctx;
>   	return 0;
> @@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
>   /*
>    * Note that this task has used io_uring. We use it for cancelation purposes.
>    */
> -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
>   {
>   	struct io_uring_task *tctx = current->io_uring;
>   
>   	if (likely(tctx && tctx->last == ctx))
>   		return 0;
> -	return __io_uring_add_tctx_node(ctx);
> +	return __io_uring_add_tctx_node(ctx, locked);
>   }
>   
>   /*
> @@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all)
>   	io_uring_cancel_generic(cancel_all, NULL);
>   }
>   
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
> +		if (tctx->registered_files[i])
> +			fput(tctx->registered_files[i]);
> +	}
> +}
> +
>   static void *io_uring_validate_mmap_request(struct file *file,
>   					    loff_t pgoff, size_t sz)
>   {
> @@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   	struct io_ring_ctx *ctx;
>   	int submitted = 0;
>   	struct fd f;
> +	struct file *file;
>   	long ret;
>   
>   	io_run_task_work();
>   
>   	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
> -			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
> +			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
> +			       IORING_ENTER_FIXED_FILE)))
>   		return -EINVAL;
>   
> -	f = fdget(fd);
> -	if (unlikely(!f.file))
> -		return -EBADF;
> +	if (flags & IORING_ENTER_FIXED_FILE) {
> +		if (fd > IO_RINGFD_REG_MAX || !current->io_uring)
ditto
> +			return -EINVAL;
> +
> +		file = current->io_uring->registered_files[fd];
> +		if (unlikely(!file))
> +			return -EBADF;
> +	} else {
> +		f = fdget(fd);
> +		if (unlikely(!f.file))
> +			return -EBADF;
> +		file = f.file;
> +	}
>   
>   	ret = -EOPNOTSUPP;
> -	if (unlikely(f.file->f_op != &io_uring_fops))
> +	if (unlikely(file->f_op != &io_uring_fops))
>   		goto out_fput;
>   
>   	ret = -ENXIO;
> -	ctx = f.file->private_data;
> +	ctx = file->private_data;
>   	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
>   		goto out_fput;
>   
> @@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   		}
>   		submitted = to_submit;
>   	} else if (to_submit) {
> -		ret = io_uring_add_tctx_node(ctx);
> +		ret = io_uring_add_tctx_node(ctx, false);
>   		if (unlikely(ret))
>   			goto out;
>   		mutex_lock(&ctx->uring_lock);
> @@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   out:
>   	percpu_ref_put(&ctx->refs);
>   out_fput:
> -	fdput(f);
> +	if (!(flags & IORING_ENTER_FIXED_FILE))
> +		fdput(f);
>   	return submitted ? submitted : ret;
>   }
>   
> @@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
>   	if (fd < 0)
>   		return fd;
>   
> -	ret = io_uring_add_tctx_node(ctx);
> +	ret = io_uring_add_tctx_node(ctx, false);
>   	if (ret) {
>   		put_unused_fd(fd);
>   		return ret;
> @@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			break;
>   		ret = io_register_iowq_max_workers(ctx, arg);
>   		break;
> +	case IORING_REGISTER_IORINGFD:
> +		ret = -EINVAL;
> +		if (nr_args != 1)
> +			break;
> +		ret = io_ringfd_register(ctx, arg);
> +		break;
> +	case IORING_UNREGISTER_IORINGFD:
> +		ret = -EINVAL;
> +		if (nr_args != 1)
> +			break;
> +		ret = io_ringfd_unregister(ctx, arg);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 649a4d7c241b..5ddea83912c7 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -9,11 +9,14 @@
>   struct sock *io_uring_get_socket(struct file *file);
>   void __io_uring_cancel(bool cancel_all);
>   void __io_uring_free(struct task_struct *tsk);
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx);
>   
>   static inline void io_uring_files_cancel(void)
>   {
> -	if (current->io_uring)
> +	if (current->io_uring) {
> +		io_uring_unreg_ringfd(current->io_uring);
>   		__io_uring_cancel(false);
> +	}
>   }
>   static inline void io_uring_task_cancel(void)
>   {
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 787f491f0d2a..f076a203317a 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -261,6 +261,7 @@ struct io_cqring_offsets {
>   #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
>   #define IORING_ENTER_SQ_WAIT	(1U << 2)
>   #define IORING_ENTER_EXT_ARG	(1U << 3)
> +#define IORING_ENTER_FIXED_FILE	(1U << 4)
>   
>   /*
>    * Passed in for io_uring_setup(2). Copied back with updated info on success
> @@ -325,6 +326,9 @@ enum {
>   	/* set/get max number of io-wq workers */
>   	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
>   
> +	IORING_REGISTER_IORINGFD		= 20,
> +	IORING_UNREGISTER_IORINGFD		= 21,
> +
>   	/* this goes last */
>   	IORING_REGISTER_LAST
>   };
> @@ -422,4 +426,9 @@ struct io_uring_getevents_arg {
>   	__u64	ts;
>   };
>   
> +struct io_uring_fd_reg {
> +	__u32	offset;
> +	__s32	fd;
> +};
> +
>   #endif
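
To tie the three bounds-check comments above together: registered_files[]
holds IO_RINGFD_REG_MAX (16) entries, so the valid offsets are
0..IO_RINGFD_REG_MAX-1 and the checks need to reject the boundary value
too, i.e. something like:

	if (reg.offset >= IO_RINGFD_REG_MAX)
		return -EINVAL;

and similarly for the unregister offset and for the fd passed with
IORING_ENTER_FIXED_FILE in io_uring_enter().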

Thread overview: 28+ messages
2022-03-03  5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
2022-03-03  8:56 ` Hao Xu [this message]
2022-03-03 13:38 ` Jens Axboe
2022-03-03 14:36   ` Jens Axboe
2022-03-03 14:40     ` Jens Axboe
2022-03-03 16:31       ` Jens Axboe
2022-03-03 17:18         ` Jens Axboe
2022-03-03 20:41           ` Jens Axboe
2022-03-03 21:19             ` Jens Axboe
2022-03-04  0:07               ` Jens Axboe
2022-03-04 13:39                 ` Xiaoguang Wang
2022-03-04 13:44                   ` Jens Axboe
2022-03-04 15:16                     ` Xiaoguang Wang
2022-03-04 15:22                       ` Jens Axboe
2022-03-08  8:38                         ` Xiaoguang Wang
2022-03-08 13:10                           ` Jens Axboe
2022-03-03 22:24             ` Vito Caputo
2022-03-03 22:26               ` Jens Axboe
2022-03-04  1:49         ` Pavel Begunkov
2022-03-04  2:18           ` Jens Axboe
2022-03-04  2:28             ` Pavel Begunkov
2022-03-04  2:35               ` Pavel Begunkov
2022-03-04  2:43               ` Jens Axboe
2022-03-04  1:52         ` Pavel Begunkov
2022-03-04  2:19           ` Jens Axboe
2022-03-04  2:39             ` Pavel Begunkov
2022-03-04  3:03               ` Jens Axboe
2022-04-21 14:16     ` Hao Xu
