* [RFC] io_uring: add fixed poll support
@ 2021-10-09 7:56 Xiaoguang Wang
2021-10-21 7:07 ` Xiaoguang Wang
2021-10-21 14:49 ` Jens Axboe
0 siblings, 2 replies; 3+ messages in thread
From: Xiaoguang Wang @ 2021-10-09 7:56 UTC (permalink / raw)
To: io-uring; +Cc: axboe, asml.silence
Recently I spent some time researching io_uring's fast-poll and multi-shot
performance using a network echo-server model. Previously I had always assumed
fast-poll is better than multi-shot and would give better performance,
but in real tests multi-shot is almost always better than fast-poll,
which is very interesting. I used ebpf to take some measurements, and
they show that whether fast-poll performs well depends entirely on
whether the first nowait try in io_issue_sqe() succeeds or fails. Take
the io_recv operation as an example (recv buffer is 16 bytes):
1) The first nowait try succeeds, so a single io_recv() is enough.
On my test machine, a successful io_recv() takes 1110ns on average.
2) The first nowait try fails, and we then have some expensive work: the
failed io_recv(), the apoll allocation, vfs_poll(), miscellaneous
initializations and checks in __io_arm_poll_handler(), and a final
successful io_recv(). Among them:
a failed io_recv() takes 620ns on average.
vfs_poll() takes 550ns on average.
I haven't measured the other overhead yet, but we can already see that if
the first nowait try fails, we need at least 2280ns (620 + 550 + 1110) to
complete the request. In my echo server tests, 40% of the first nowait
io_recv() attempts fail.
The above measurements explain why multi-shot is better than fast-poll:
multi-shot ensures the first nowait try succeeds.
Based on the above measurements, I try to improve fast-poll a bit:
1. Introduce fixed poll support; currently it only works in registered-file
mode (a userspace registration sketch follows this list). With this feature,
we can get rid of the work repeated for every request in
io_arm_poll_handler(): the apoll allocation and the miscellaneous
initializations and checks.
2. Introduce an event generation counter, which increases monotonically.
If no new event has happened, we don't need to call vfs_poll(); we just
put the req on a waiting list.
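For reference, here is a minimal userspace sketch, assuming liburing, of how
the sockets end up registered so that requests take the fixed-file path the
fixed polls are attached to; setup_ring(), fds[] and the queue depth are
illustrative names, not part of this patch:

  #include <liburing.h>

  /*
   * Register a set of already-connected sockets so that later requests
   * can reference them by index via IOSQE_FIXED_FILE. With this patch,
   * io_sqe_files_register() would also allocate and arm the per-file
   * read/write fixed polls at this point.
   */
  static int setup_ring(struct io_uring *ring, int *fds, unsigned nr)
  {
          int ret;

          ret = io_uring_queue_init(256, ring, 0);
          if (ret < 0)
                  return ret;

          ret = io_uring_register_files(ring, fds, nr);
          if (ret < 0)
                  io_uring_queue_exit(ring);
          return ret;
  }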
With this patch, echo server tests (1000 connections, 16-byte packets)
show a 1%~2% tps improvement.
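The benchmark is roughly the following per-connection loop (again a sketch
assuming liburing; 'idx' is the socket's index in the registered file table
and the 16-byte buffer matches the packet size above). Because the request
carries IOSQE_FIXED_FILE, it goes through io_file_get_fixed() and can pick
up the fixed poll added by this patch:

  #include <liburing.h>

  /* Receive up to 16 bytes and echo them back on the same connection. */
  static int echo_once(struct io_uring *ring, int idx, char *buf)
  {
          struct io_uring_sqe *sqe;
          struct io_uring_cqe *cqe;
          int ret;

          sqe = io_uring_get_sqe(ring);
          io_uring_prep_recv(sqe, idx, buf, 16, 0);
          sqe->flags |= IOSQE_FIXED_FILE;
          io_uring_submit(ring);

          ret = io_uring_wait_cqe(ring, &cqe);
          if (ret < 0)
                  return ret;
          ret = cqe->res;
          io_uring_cqe_seen(ring, cqe);
          if (ret <= 0)
                  return ret;

          sqe = io_uring_get_sqe(ring);
          io_uring_prep_send(sqe, idx, buf, ret, 0);
          sqe->flags |= IOSQE_FIXED_FILE;
          io_uring_submit(ring);

          ret = io_uring_wait_cqe(ring, &cqe);
          if (ret < 0)
                  return ret;
          ret = cqe->res;
          io_uring_cqe_seen(ring, cqe);
          return ret;
  }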
Signed-off-by: Xiaoguang Wang <[email protected]>
---
fs/io_uring.c | 258 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 245 insertions(+), 13 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8317c360f7a4..71df37c1eea6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -216,9 +216,20 @@ struct io_overflow_cqe {
struct list_head list;
};
+struct io_fixed_poll {
+ struct wait_queue_head *head;
+ struct list_head list;
+ struct wait_queue_entry wait;
+ unsigned long generation;
+ __poll_t events;
+};
+
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
+ struct io_fixed_poll *rpoll;
+ struct io_fixed_poll *wpoll;
};
struct io_rsrc_put {
@@ -226,7 +237,7 @@ struct io_rsrc_put {
u64 tag;
union {
void *rsrc;
- struct file *file;
+ struct io_fixed_file *file_slot;
struct io_mapped_ubuf *buf;
};
};
@@ -744,6 +755,7 @@ enum {
REQ_F_NOWAIT_READ_BIT,
REQ_F_NOWAIT_WRITE_BIT,
REQ_F_ISREG_BIT,
+ REQ_F_FIXED_POLL_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -795,6 +807,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* already went through fixed poll handler */
+ REQ_F_FIXED_POLL = BIT(REQ_F_FIXED_POLL_BIT),
};
struct async_poll {
@@ -878,7 +892,12 @@ struct io_kiocb {
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
- struct async_poll *apoll;
+ union {
+ struct async_poll *apoll;
+ struct io_fixed_poll *fixed_poll;
+ };
+ struct list_head fixed_poll_node;
+
struct io_wq_work work;
const struct cred *creds;
@@ -6611,7 +6630,7 @@ static void io_clean_op(struct io_kiocb *req)
break;
}
}
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ if (((req->flags & (REQ_F_POLLED | REQ_F_FIXED_POLL)) == REQ_F_POLLED) && req->apoll) {
kfree(req->apoll->double_poll);
kfree(req->apoll);
req->apoll = NULL;
@@ -6836,21 +6855,42 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
file_slot->file_ptr = file_ptr;
}
+static inline void io_fixed_rpoll_set(struct io_fixed_file *file_slot, struct io_fixed_poll *rpoll)
+{
+ file_slot->rpoll = rpoll;
+}
+
+static inline void io_fixed_wpoll_set(struct io_fixed_file *file_slot, struct io_fixed_poll *wpoll)
+{
+ file_slot->wpoll = wpoll;
+}
+
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd)
{
+ struct io_fixed_file *file_slot;
struct file *file;
unsigned long file_ptr;
+ const struct io_op_def *def = &io_op_defs[req->opcode];
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+ file_slot = io_fixed_file_slot(&ctx->file_table, fd);
+ file_ptr = file_slot->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
io_req_set_rsrc_node(req);
+
+ if (def->pollin)
+ req->fixed_poll = file_slot->rpoll;
+ else if (def->pollout)
+ req->fixed_poll = file_slot->wpoll;
+ else
+ req->fixed_poll = NULL;
+
return file;
}
@@ -6947,8 +6987,13 @@ static void __io_queue_sqe(struct io_kiocb *req)
{
struct io_kiocb *linked_timeout;
int ret;
+ struct io_fixed_poll *fixed_poll = req->flags & REQ_F_FIXED_FILE ? req->fixed_poll : NULL;
+ unsigned long generation;
issue_sqe:
+ if (fixed_poll)
+ generation = READ_ONCE(fixed_poll->generation);
+
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
/*
@@ -6972,7 +7017,27 @@ static void __io_queue_sqe(struct io_kiocb *req)
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
linked_timeout = io_prep_linked_timeout(req);
- switch (io_arm_poll_handler(req)) {
+ if (fixed_poll) {
+ if (req->flags & REQ_F_POLLED) {
+ ret = IO_APOLL_ABORTED;
+ } else {
+ spin_lock_irq(&fixed_poll->head->lock);
+ req->flags |= REQ_F_POLLED | REQ_F_FIXED_POLL;
+ /* new events happen */
+ if (generation != fixed_poll->generation) {
+ spin_unlock_irq(&fixed_poll->head->lock);
+ goto issue_sqe;
+ }
+ INIT_LIST_HEAD(&req->fixed_poll_node);
+ list_add_tail(&req->fixed_poll_node, &fixed_poll->list);
+ spin_unlock_irq(&fixed_poll->head->lock);
+ return;
+ }
+ } else {
+ ret = io_arm_poll_handler(req);
+ }
+
+ switch (ret) {
case IO_APOLL_READY:
if (linked_timeout)
io_unprep_linked_timeout(req);
@@ -7831,6 +7896,18 @@ static void io_free_file_tables(struct io_file_table *table)
table->files = NULL;
}
+static inline void io_remove_fixed_poll(struct io_fixed_poll *poll)
+{
+ if (!poll)
+ return;
+
+ spin_lock_irq(&poll->head->lock);
+ if (!list_empty(&poll->wait.entry))
+ list_del_init(&poll->wait.entry);
+ spin_unlock_irq(&poll->head->lock);
+ kfree(poll);
+}
+
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_UNIX)
@@ -7845,11 +7922,17 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
int i;
for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
+ struct io_fixed_file *file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ struct file *file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
file = io_file_from_index(ctx, i);
- if (file)
+ if (file) {
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
+ }
}
#endif
io_free_file_tables(&ctx->file_table);
@@ -8094,7 +8177,12 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
- struct file *file = prsrc->file;
+ struct io_fixed_file *file_slot = prsrc->file_slot;
+ struct file *file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
+
+
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -8131,6 +8219,8 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
} else {
__skb_queue_tail(&list, skb);
}
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
file = NULL;
break;
@@ -8151,6 +8241,8 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
spin_unlock_irq(&head->lock);
}
#else
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
#endif
}
@@ -8204,10 +8296,118 @@ static void io_rsrc_put_work(struct work_struct *work)
}
}
+struct io_fixed_poll_table {
+ struct poll_table_struct pt;
+ struct io_fixed_poll *poll;
+ int nr_entries;
+ int error;
+};
+
+static void io_fixed_poll_task_func(struct io_kiocb *req, bool *locked)
+{
+ struct io_fixed_poll *poll = req->fixed_poll;
+ bool canceled = false;
+
+ trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
+
+ /* req->task == current here, checking PF_EXITING is safe */
+ if (unlikely(req->task->flags & PF_EXITING))
+ canceled = true;
+
+ if (!req->result && !canceled) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+ if (!req->result)
+ return;
+
+ if (!canceled)
+ io_req_task_submit(req, locked);
+ else
+ io_req_complete_failed(req, -ECANCELED);
+}
+
+static int io_fixed_poll_wake(struct wait_queue_entry *wait,
+ unsigned int mode, int sync, void *key)
+{
+ struct io_fixed_poll *poll = wait->private;
+ struct io_kiocb *req, *nxt;
+ __poll_t mask = key_to_poll(key);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
+
+ poll->generation++;
+ list_for_each_entry_safe(req, nxt, &poll->list, fixed_poll_node) {
+ req->result = key_to_poll(key);
+ req->io_task_work.func = io_fixed_poll_task_func;
+ io_req_task_work_add(req);
+ list_del(&req->fixed_poll_node);
+ }
+
+ return 1;
+}
+
+static void io_fixed_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_fixed_poll_table *ipt;
+
+ ipt = container_of(p, struct io_fixed_poll_table, pt);
+ if (unlikely(ipt->nr_entries)) {
+ ipt->error = -EOPNOTSUPP;
+ return;
+ }
+ ipt->poll->head = head;
+ ipt->nr_entries++;
+ if (ipt->poll->events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(head, &ipt->poll->wait);
+ else
+ add_wait_queue(head, &ipt->poll->wait);
+}
+
+static inline struct io_fixed_poll *io_init_fixed_poll(struct file *file, bool pollin)
+{
+ struct io_fixed_poll *poll;
+ struct io_fixed_poll_table ipt;
+ __poll_t mask;
+ int rw = pollin ? READ : WRITE;
+
+ if (!file_can_poll(file) || !__io_file_supports_nowait(file, rw))
+ return NULL;
+
+ poll = kzalloc(sizeof(struct io_fixed_poll), GFP_KERNEL);
+ if (!poll)
+ return ERR_PTR(-ENOMEM);
+
+ ipt.pt._qproc = io_fixed_poll_queue_proc;
+ ipt.poll = poll;
+ ipt.error = 0;
+ ipt.nr_entries = 0;
+ if (pollin)
+ ipt.pt._key = POLLERR | POLLPRI | POLLIN | POLLRDNORM;
+ else
+ ipt.pt._key = POLLERR | POLLPRI | POLLOUT | POLLWRNORM;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ INIT_LIST_HEAD(&poll->list);
+ init_waitqueue_func_entry(&poll->wait, io_fixed_poll_wake);
+ poll->wait.private = poll;
+ poll->events = ipt.pt._key;
+ mask = vfs_poll(file, &ipt.pt);
+ if (ipt.error && poll->head) {
+ io_remove_fixed_poll(poll);
+ return NULL;
+ }
+ return poll;
+}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
+ struct io_fixed_file *file_slot;
struct file *file;
int fd, ret;
unsigned i;
@@ -8233,6 +8433,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+ struct io_fixed_poll *poll;
+
if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
@@ -8261,7 +8463,19 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
+
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ io_fixed_file_set(file_slot, file);
+
+ poll = io_init_fixed_poll(file, true);
+ if (IS_ERR(poll))
+ goto out_fput;
+ io_fixed_rpoll_set(file_slot, poll);
+
+ poll = io_init_fixed_poll(file, false);
+ if (IS_ERR(poll))
+ goto out_fput;
+ io_fixed_wpoll_set(file_slot, poll);
}
ret = io_sqe_files_scm(ctx);
@@ -8274,9 +8488,17 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
out_fput:
for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+
+ if (file) {
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
+
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
+ }
}
io_free_file_tables(&ctx->file_table);
ctx->nr_user_files = 0;
@@ -8412,6 +8634,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
int fd, i, err = 0;
unsigned int done;
bool needs_switch = false;
+ struct io_fixed_poll *rpoll, *wpoll;
if (!ctx->file_data)
return -ENXIO;
@@ -8437,9 +8660,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
file_slot = io_fixed_file_slot(&ctx->file_table, i);
if (file_slot->file_ptr) {
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
+ ctx->rsrc_node, file_slot);
if (err)
break;
file_slot->file_ptr = 0;
@@ -8472,6 +8694,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
fput(file);
break;
}
+
+ rpoll = io_init_fixed_poll(file, true);
+ wpoll = io_init_fixed_poll(file, false);
+ if (IS_ERR(rpoll) || IS_ERR(wpoll)) {
+ file_slot->file_ptr = 0;
+ fput(file);
+ break;
+ }
+ io_fixed_rpoll_set(file_slot, rpoll);
+ io_fixed_wpoll_set(file_slot, wpoll);
}
}
--
2.14.4.44.g2045bb6
* Re: [RFC] io_uring: add fixed poll support
2021-10-09 7:56 [RFC] io_uring: add fixed poll support Xiaoguang Wang
@ 2021-10-21 7:07 ` Xiaoguang Wang
2021-10-21 14:49 ` Jens Axboe
1 sibling, 0 replies; 3+ messages in thread
From: Xiaoguang Wang @ 2021-10-21 7:07 UTC (permalink / raw)
To: io-uring; +Cc: axboe, asml.silence
hello,

Also pinging this RFC patch :)
I'd like you to have a look at my analysis in the commit message, which at
least helped me realize that the first nowait try in io_uring may also
introduce non-trivial overhead.

Regards,
Xiaoguang Wang
> Recently I spent some time researching io_uring's fast-poll and multi-shot
> performance using a network echo-server model. Previously I had always assumed
> fast-poll is better than multi-shot and would give better performance,
> but in real tests multi-shot is almost always better than fast-poll,
> which is very interesting. I used ebpf to take some measurements, and
> they show that whether fast-poll performs well depends entirely on
> whether the first nowait try in io_issue_sqe() succeeds or fails. Take
> the io_recv operation as an example (recv buffer is 16 bytes):
> 1) The first nowait try succeeds, so a single io_recv() is enough.
> On my test machine, a successful io_recv() takes 1110ns on average.
>
> 2) The first nowait try fails, and we then have some expensive work: the
> failed io_recv(), the apoll allocation, vfs_poll(), miscellaneous
> initializations and checks in __io_arm_poll_handler(), and a final
> successful io_recv(). Among them:
> a failed io_recv() takes 620ns on average.
> vfs_poll() takes 550ns on average.
> I haven't measured the other overhead yet, but we can already see that if
> the first nowait try fails, we need at least 2280ns (620 + 550 + 1110) to
> complete the request. In my echo server tests, 40% of the first nowait
> io_recv() attempts fail.
>
> The above measurements explain why multi-shot is better than fast-poll:
> multi-shot ensures the first nowait try succeeds.
>
> Based on the above measurements, I try to improve fast-poll a bit:
> 1. Introduce fixed poll support; currently it only works in registered-file
> mode. With this feature, we can get rid of the work repeated for every
> request in io_arm_poll_handler(): the apoll allocation and the
> miscellaneous initializations and checks.
> 2. Introduce an event generation counter, which increases monotonically.
> If no new event has happened, we don't need to call vfs_poll(); we just
> put the req on a waiting list.
>
> With this patch, echo server tests (1000 connections, 16-byte packets)
> show a 1%~2% tps improvement.
>
> Signed-off-by: Xiaoguang Wang <[email protected]>
> ---
> fs/io_uring.c | 258 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 245 insertions(+), 13 deletions(-)
* Re: [RFC] io_uring: add fixed poll support
2021-10-09 7:56 [RFC] io_uring: add fixed poll support Xiaoguang Wang
2021-10-21 7:07 ` Xiaoguang Wang
@ 2021-10-21 14:49 ` Jens Axboe
1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2021-10-21 14:49 UTC (permalink / raw)
To: Xiaoguang Wang, io-uring; +Cc: asml.silence
On 10/9/21 1:56 AM, Xiaoguang Wang wrote:
> Recently I spent some time researching io_uring's fast-poll and multi-shot
> performance using a network echo-server model. Previously I had always assumed
> fast-poll is better than multi-shot and would give better performance,
> but in real tests multi-shot is almost always better than fast-poll,
> which is very interesting. I used ebpf to take some measurements, and
> they show that whether fast-poll performs well depends entirely on
> whether the first nowait try in io_issue_sqe() succeeds or fails. Take
> the io_recv operation as an example (recv buffer is 16 bytes):
> 1) The first nowait try succeeds, so a single io_recv() is enough.
> On my test machine, a successful io_recv() takes 1110ns on average.
>
> 2) The first nowait try fails, and we then have some expensive work: the
> failed io_recv(), the apoll allocation, vfs_poll(), miscellaneous
> initializations and checks in __io_arm_poll_handler(), and a final
> successful io_recv(). Among them:
> a failed io_recv() takes 620ns on average.
> vfs_poll() takes 550ns on average.
> I haven't measured the other overhead yet, but we can already see that if
> the first nowait try fails, we need at least 2280ns (620 + 550 + 1110) to
> complete the request. In my echo server tests, 40% of the first nowait
> io_recv() attempts fail.
>
> The above measurements explain why multi-shot is better than fast-poll:
> multi-shot ensures the first nowait try succeeds.
>
> Based on the above measurements, I try to improve fast-poll a bit:
> 1. Introduce fixed poll support; currently it only works in registered-file
> mode. With this feature, we can get rid of the work repeated for every
> request in io_arm_poll_handler(): the apoll allocation and the
> miscellaneous initializations and checks.
> 2. Introduce an event generation counter, which increases monotonically.
> If no new event has happened, we don't need to call vfs_poll(); we just
> put the req on a waiting list.

This also needs a respin, and can you split it into two patches?
--
Jens Axboe