From: Jens Axboe <[email protected]>
To: Xiaoguang Wang <[email protected]>,
[email protected]
Cc: [email protected]
Subject: Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
Date: Thu, 3 Mar 2022 10:18:12 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
On 3/3/22 9:31 AM, Jens Axboe wrote:
> On 3/3/22 7:40 AM, Jens Axboe wrote:
>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>> The only potential oddity here is that the fd passed back is not a
>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>> that could cause some confusion even if I don't think anyone actually
>>> does poll(2) on io_uring.
>>
>> Side note - the only implication here is that we then likely can't make
>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>> flag which tells us that the application is aware of this limitation.
>> Though I guess close(2) might mess with that too... Hmm.
>
> Not sure I can find a good approach for that. Tried out your patch and
> made some fixes:
>
> - Missing free on final tctx free
> - Rename registered_files to registered_rings
> - Fix off-by-ones in checking max registration count
> - Use kcalloc
> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
> - Don't pass in tctx to io_uring_unreg_ringfd()
> - Get rid of forward declaration for adding tctx node
> - Get rid of extra file pointer in io_uring_enter()
> - Fix deadlock in io_ringfd_register()
> - Use io_uring_rsrc_update rather than add a new struct type
- Allow multiple register/unregister instead of enforcing just 1 at the
time
- Check for it really being a ring fd when registering
For different batch counts, nice improvements are seen. Roughly:
Batch==1 15% faster
Batch==2 13% faster
Batch==4 11% faster
This is just in microbenchmarks where the fdget/fdput play a bigger
factor, but it will certainly help with real world situations where
batching is more limited than in benchmarks.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad3e0b0ab3b9..452e68b73e1f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,8 @@ struct io_ring_ctx {
};
};
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -481,6 +483,7 @@ struct io_uring_task {
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -8806,8 +8809,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8816,6 +8827,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8840,6 +8852,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
@@ -10061,6 +10074,103 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ struct file *file;
+ int ret, nr, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(®, arg, sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (tctx->registered_rings[reg.offset]) {
+ ret = -EBUSY;
+ break;
+ }
+ file = fget(reg.data);
+ if (!file) {
+ ret = -EBADF;
+ break;
+ } else if (file->f_op != &io_uring_fops) {
+ ret = -EOPNOTSUPP;
+ fput(file);
+ break;
+ }
+
+ tctx->registered_rings[reg.offset] = file;
+ arg += sizeof(reg);
+ nr++;
+ }
+
+ return nr ? nr : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret, nr, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ ret = nr = 0;
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(®, arg, sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ arg += sizeof(reg);
+ nr++;
+ }
+
+ return nr ? nr : ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -10191,12 +10301,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (fd >= IO_RINGFD_REG_MAX || !tctx)
+ return -EINVAL;
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10270,7 +10391,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
@@ -11160,6 +11282,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_IORINGFD:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IORINGFD:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
static inline void io_uring_files_cancel(void)
{
- if (current->io_uring)
+ if (current->io_uring) {
+ io_uring_unreg_ringfd();
__io_uring_cancel(false);
+ }
}
static inline void io_uring_task_cancel(void)
{
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..a36d31f2cc66 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
/*
* io_uring_enter(2) flags
*/
-#define IORING_ENTER_GETEVENTS (1U << 0)
-#define IORING_ENTER_SQ_WAKEUP (1U << 1)
-#define IORING_ENTER_SQ_WAIT (1U << 2)
-#define IORING_ENTER_EXT_ARG (1U << 3)
+#define IORING_ENTER_GETEVENTS (1U << 0)
+#define IORING_ENTER_SQ_WAKEUP (1U << 1)
+#define IORING_ENTER_SQ_WAIT (1U << 2)
+#define IORING_ENTER_EXT_ARG (1U << 3)
+#define IORING_ENTER_REGISTERED_RING (1U << 4)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
/* set/get max number of io-wq workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
+ /* register/unregister io_uring fd with the ring */
+ IORING_REGISTER_IORINGFD = 20,
+ IORING_UNREGISTER_IORINGFD = 21,
+
/* this goes last */
IORING_REGISTER_LAST
};
--
Jens Axboe
next prev parent reply other threads:[~2022-03-03 17:18 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-03-03 5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
2022-03-03 8:56 ` Hao Xu
2022-03-03 13:38 ` Jens Axboe
2022-03-03 14:36 ` Jens Axboe
2022-03-03 14:40 ` Jens Axboe
2022-03-03 16:31 ` Jens Axboe
2022-03-03 17:18 ` Jens Axboe [this message]
2022-03-03 20:41 ` Jens Axboe
2022-03-03 21:19 ` Jens Axboe
2022-03-04 0:07 ` Jens Axboe
2022-03-04 13:39 ` Xiaoguang Wang
2022-03-04 13:44 ` Jens Axboe
2022-03-04 15:16 ` Xiaoguang Wang
2022-03-04 15:22 ` Jens Axboe
2022-03-08 8:38 ` Xiaoguang Wang
2022-03-08 13:10 ` Jens Axboe
2022-03-03 22:24 ` Vito Caputo
2022-03-03 22:26 ` Jens Axboe
2022-03-04 1:49 ` Pavel Begunkov
2022-03-04 2:18 ` Jens Axboe
2022-03-04 2:28 ` Pavel Begunkov
2022-03-04 2:35 ` Pavel Begunkov
2022-03-04 2:43 ` Jens Axboe
2022-03-04 1:52 ` Pavel Begunkov
2022-03-04 2:19 ` Jens Axboe
2022-03-04 2:39 ` Pavel Begunkov
2022-03-04 3:03 ` Jens Axboe
2022-04-21 14:16 ` Hao Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox