* [RFC v2 2/3] io_uring: add fixed poll support
2021-10-28 12:28 [RFC v2 1/3] io_uring: remove req->apoll check in io_clean_op() Xiaoguang Wang
@ 2021-10-28 12:28 ` Xiaoguang Wang
2023-03-01 17:35 ` Jens Axboe
2021-10-28 12:28 ` [RFC v2 3/3] io_uring: introduce event generation for fixed poll Xiaoguang Wang
1 sibling, 1 reply; 5+ messages in thread
From: Xiaoguang Wang @ 2021-10-28 12:28 UTC (permalink / raw)
To: io-uring; +Cc: axboe, asml.silence
Recently I spend time to research io_uring's fast-poll and multi-shot's
performance using network echo-server model. Previously I always thought
fast-poll is better than multi-shot and will give better performance,
but indeed multi-shot is almost always better than fast-poll in real
test, which is very interesting. I use ebpf to have some measurements,
it shows that whether fast-poll is excellent or not depends entirely on
that the first nowait try in io_issue_sqe() succeeds or fails. Take
io_recv operation as example(recv buffer is 16 bytes):
1) the first nowait succeeds, a simple io_recv() is enough.
In my test machine, successful io_recv() consumes 1110ns averagely.
2) the first nowait fails, then we'll have some expensive work, which
contains failed io_revc(), apoll allocations, vfs_poll(), miscellaneous
initializations anc check in __io_arm_poll_handler() and a final
successful io_recv(). Among then:
failed io_revc() consumes 620ns averagely.
vfs_poll() consumes 550ns averagely.
I don't measure other overhead yet, but we can see if the first nowait
try fails, we'll need at least 2290ns(620 + 550 + 1110) to complete it.
In my echo server tests, 40% of first nowait io_recv() operations fails.
From above measurements, it can explain why mulit-shot is better than
multi-shot, mulit-shot can ensure the first nowait try succeed.
Based on above measurements, I try to improve fast-poll a bit:
Introduce fix poll support, currently it only works in file registered
mode. With this feature, we can get rid of various repeated operations
in io_arm_poll_handler(), contains apoll allocations, and miscellaneous
initializations anc check.
Signed-off-by: Xiaoguang Wang <[email protected]>
---
fs/io_uring.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 300 insertions(+), 25 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7361ae53cad3..6f63aea3e0c0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -219,9 +219,19 @@ struct io_overflow_cqe {
struct list_head list;
};
+struct io_fixed_poll {
+ struct wait_queue_head *head;
+ struct list_head list;
+ struct wait_queue_entry wait;
+ __poll_t events;
+};
+
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
+ struct io_fixed_poll *rpoll;
+ struct io_fixed_poll *wpoll;
};
struct io_rsrc_put {
@@ -229,7 +239,7 @@ struct io_rsrc_put {
u64 tag;
union {
void *rsrc;
- struct file *file;
+ struct io_fixed_file *file_slot;
struct io_mapped_ubuf *buf;
};
};
@@ -736,6 +746,7 @@ enum {
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
+ REQ_F_FIXED_POLL_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -787,6 +798,8 @@ enum {
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
/* ->async_data allocated */
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
+ /* already went through fixed poll handler */
+ REQ_F_FIXED_POLL = BIT(REQ_F_FIXED_POLL_BIT),
};
struct async_poll {
@@ -873,6 +886,9 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
+ struct io_fixed_poll *fixed_poll;
+ struct list_head fixed_poll_node;
+
struct io_wq_work work;
/* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
@@ -5281,7 +5297,10 @@ static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
if (req->opcode == IORING_OP_POLL_ADD)
return req->async_data;
- return req->apoll->double_poll;
+ else if (req->flags & REQ_F_FIXED_POLL)
+ return NULL;
+ else
+ return req->apoll->double_poll;
}
static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
@@ -5642,13 +5661,32 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
return do_complete;
}
+static bool io_fixed_poll_remove_one(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
+{
+ struct io_fixed_poll *fixed_poll = req->fixed_poll;
+ bool do_complete = false;
+
+ spin_lock_irq(&fixed_poll->head->lock);
+ if (!list_empty(&req->fixed_poll_node)) {
+ list_del_init(&req->fixed_poll_node);
+ do_complete = true;
+ }
+ spin_unlock_irq(&fixed_poll->head->lock);
+ hash_del(&req->hash_node);
+ return do_complete;
+}
+
static bool io_poll_remove_one(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
bool do_complete;
io_poll_remove_double(req);
- do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
+ if (req->flags & REQ_F_FIXED_POLL)
+ do_complete = io_fixed_poll_remove_one(req);
+ else
+ do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
if (do_complete) {
io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
@@ -6816,18 +6854,29 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd)
{
+ struct io_fixed_file *file_slot;
struct file *file;
unsigned long file_ptr;
+ const struct io_op_def *def = &io_op_defs[req->opcode];
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+ file_slot = io_fixed_file_slot(&ctx->file_table, fd);
+ file_ptr = file_slot->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
io_req_set_rsrc_node(req, ctx);
+
+ if (def->pollin)
+ req->fixed_poll = file_slot->rpoll;
+ else if (def->pollout)
+ req->fixed_poll = file_slot->wpoll;
+ else
+ req->fixed_poll = NULL;
+
return file;
}
@@ -6919,12 +6968,47 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
+static inline int io_arm_fixed_poll_handler(struct io_kiocb *req,
+ struct io_fixed_poll *fixed_poll)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct poll_table_struct pt = { ._key = fixed_poll->events };
+ __poll_t result;
+
+ if (req->flags & REQ_F_FIXED_POLL)
+ return IO_APOLL_ABORTED;
+
+ req->flags |= REQ_F_FIXED_POLL;
+ result = vfs_poll(req->file, &pt) & fixed_poll->events;
+ if (result)
+ return IO_APOLL_READY;
+
+ spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&fixed_poll->head->lock);
+ INIT_LIST_HEAD(&req->fixed_poll_node);
+ list_add_tail(&req->fixed_poll_node, &fixed_poll->list);
+ io_poll_req_insert(req);
+ spin_unlock_irq(&fixed_poll->head->lock);
+ spin_unlock(&ctx->completion_lock);
+ return IO_APOLL_OK;
+}
+
static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_fixed_poll *fixed_poll = NULL;
+ int ret;
- switch (io_arm_poll_handler(req)) {
+ if (req->flags & REQ_F_FIXED_FILE)
+ fixed_poll = req->fixed_poll;
+
+ if (fixed_poll)
+ ret = io_arm_fixed_poll_handler(req, fixed_poll);
+ else
+ ret = io_arm_poll_handler(req);
+
+ switch (ret) {
case IO_APOLL_READY:
if (linked_timeout) {
io_unprep_linked_timeout(req);
@@ -7846,8 +7930,22 @@ static void io_free_file_tables(struct io_file_table *table)
table->files = NULL;
}
+static inline void io_remove_fixed_poll(struct io_fixed_poll *poll)
+{
+ if (!poll)
+ return;
+
+ spin_lock_irq(&poll->head->lock);
+ if (!list_empty(&poll->wait.entry))
+ list_del_init(&poll->wait.entry);
+ spin_unlock_irq(&poll->head->lock);
+ kfree(poll);
+}
+
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+ int i;
+
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
struct sock *sock = ctx->ring_sock->sk;
@@ -7856,17 +7954,24 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
kfree_skb(skb);
}
-#else
- int i;
+
+#endif
for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
+ struct io_fixed_file *file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ struct file *file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
file = io_file_from_index(ctx, i);
- if (file)
+ if (file) {
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
+#if !defined(CONFIG_UNIX)
fput(file);
- }
#endif
+ }
+ }
io_free_file_tables(&ctx->file_table);
io_rsrc_data_free(ctx->file_data);
ctx->file_data = NULL;
@@ -8109,7 +8214,12 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
- struct file *file = prsrc->file;
+ struct io_fixed_file *file_slot = prsrc->file_slot;
+ struct file *file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
+
+
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -8146,6 +8256,8 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
} else {
__skb_queue_tail(&list, skb);
}
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
file = NULL;
break;
@@ -8166,8 +8278,13 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
spin_unlock_irq(&head->lock);
}
#else
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
#endif
+
+ /* free_slot is allocated in io_queue_remove_fixed_file(), free it here. */
+ kfree(file_slot);
}
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
@@ -8219,10 +8336,140 @@ static void io_rsrc_put_work(struct work_struct *work)
}
}
+struct io_fixed_poll_table {
+ struct poll_table_struct pt;
+ struct io_fixed_poll *poll;
+ int nr_entries;
+ int error;
+};
+
+static void io_fixed_poll_task_func(struct io_kiocb *req, bool *locked)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_fixed_poll *poll = req->fixed_poll;
+ bool canceled = false;
+
+ trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
+
+ /* req->task == current here, checking PF_EXITING is safe */
+ if (unlikely(req->task->flags & PF_EXITING))
+ canceled = true;
+
+ if (!req->result && !canceled) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+ if (!req->result)
+ return;
+
+ spin_lock(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ spin_unlock(&ctx->completion_lock);
+
+ if (!canceled)
+ io_req_task_submit(req, locked);
+ else
+ io_req_complete_failed(req, -ECANCELED);
+}
+
+static int io_fixed_poll_wake(struct wait_queue_entry *wait,
+ unsigned int mode, int sync, void *key)
+{
+ struct io_fixed_poll *poll = wait->private;
+ struct io_kiocb *req, *nxt;
+ __poll_t mask = key_to_poll(key);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
+
+ list_for_each_entry_safe(req, nxt, &poll->list, fixed_poll_node) {
+ req->result = mask;
+ req->io_task_work.func = io_fixed_poll_task_func;
+ io_req_task_work_add(req);
+ list_del_init(&req->fixed_poll_node);
+ }
+
+ return 1;
+}
+
+static void io_fixed_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_fixed_poll_table *ipt;
+
+ ipt = container_of(p, struct io_fixed_poll_table, pt);
+ if (unlikely(ipt->nr_entries)) {
+ ipt->error = -EOPNOTSUPP;
+ return;
+ }
+ ipt->poll->head = head;
+ ipt->nr_entries++;
+ if (ipt->poll->events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(head, &ipt->poll->wait);
+ else
+ add_wait_queue(head, &ipt->poll->wait);
+}
+
+static inline struct io_fixed_poll *io_init_fixed_poll(struct file *file, bool pollin)
+{
+ struct io_fixed_poll *poll;
+ struct io_fixed_poll_table ipt;
+ umode_t mode = file_inode(file)->i_mode;
+ __poll_t mask;
+
+ if (!file_can_poll(file) || !__io_file_supports_nowait(file, mode))
+ return NULL;
+
+ poll = kzalloc(sizeof(struct io_fixed_poll), GFP_KERNEL);
+ if (!poll)
+ return ERR_PTR(-ENOMEM);
+
+ ipt.pt._qproc = io_fixed_poll_queue_proc;
+ ipt.poll = poll;
+ ipt.error = 0;
+ ipt.nr_entries = 0;
+ if (pollin)
+ ipt.pt._key = POLLERR | POLLPRI | POLLIN | POLLRDNORM;
+ else
+ ipt.pt._key = POLLERR | POLLPRI | POLLOUT | POLLWRNORM;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ INIT_LIST_HEAD(&poll->list);
+ init_waitqueue_func_entry(&poll->wait, io_fixed_poll_wake);
+ poll->wait.private = poll;
+ poll->events = ipt.pt._key;
+ mask = vfs_poll(file, &ipt.pt);
+ if (ipt.error && poll->head) {
+ io_remove_fixed_poll(poll);
+ return NULL;
+ }
+ return poll;
+}
+
+static void io_register_fixed_poll(struct io_fixed_file *file_slot, struct file *file)
+{
+ struct io_fixed_poll *rpoll, *wpoll;
+
+ rpoll = io_init_fixed_poll(file, true);
+ if (IS_ERR(rpoll))
+ return;
+
+ wpoll = io_init_fixed_poll(file, false);
+ if (IS_ERR(wpoll)) {
+ io_remove_fixed_poll(rpoll);
+ return;
+ }
+
+ file_slot->rpoll = rpoll;
+ file_slot->wpoll = wpoll;
+}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
+ struct io_fixed_file *file_slot;
struct file *file;
int fd, ret;
unsigned i;
@@ -8276,7 +8523,10 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
+
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ io_fixed_file_set(file_slot, file);
+ io_register_fixed_poll(file_slot, file);
}
ret = io_sqe_files_scm(ctx);
@@ -8289,9 +8539,17 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
out_fput:
for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+
+ if (file) {
+ struct io_fixed_poll *rpoll = file_slot->rpoll;
+ struct io_fixed_poll *wpoll = file_slot->wpoll;
+
+ io_remove_fixed_poll(rpoll);
+ io_remove_fixed_poll(wpoll);
fput(file);
+ }
}
io_free_file_tables(&ctx->file_table);
ctx->nr_user_files = 0;
@@ -8359,6 +8617,25 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
return 0;
}
+static inline int io_queue_remove_fixed_file(struct io_rsrc_data *data, unsigned idx,
+ struct io_rsrc_node *node, struct io_fixed_file *file_slot)
+{
+ struct io_fixed_file *slot;
+ int ret;
+
+ slot = kzalloc(sizeof(struct io_fixed_file), GFP_KERNEL);
+ if (!slot)
+ return -ENOMEM;
+
+ slot->file_ptr = file_slot->file_ptr;
+ slot->rpoll = file_slot->rpoll;
+ slot->wpoll = file_slot->wpoll;
+ ret = io_queue_rsrc_removal(data, idx, node, slot);
+ if (ret < 0)
+ kfree(slot);
+ return ret;
+}
+
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
@@ -8382,15 +8659,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
- struct file *old_file;
-
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
- old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, old_file);
+ ret = io_queue_remove_fixed_file(ctx->file_data, slot_index,
+ ctx->rsrc_node, file_slot);
if (ret)
goto err;
file_slot->file_ptr = 0;
@@ -8399,6 +8673,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
+ io_register_fixed_poll(file_slot, file);
ret = io_sqe_file_register(ctx, file, slot_index);
if (ret) {
file_slot->file_ptr = 0;
@@ -8421,7 +8696,6 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
- struct file *file;
int ret, i;
io_ring_submit_lock(ctx, needs_lock);
@@ -8441,8 +8715,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
if (!file_slot->file_ptr)
goto out;
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+ ret = io_queue_remove_fixed_file(ctx->file_data, offset, ctx->rsrc_node, file_slot);
if (ret)
goto out;
@@ -8491,12 +8764,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
file_slot = io_fixed_file_slot(&ctx->file_table, i);
if (file_slot->file_ptr) {
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
+ err = io_queue_remove_fixed_file(data, up->offset + done,
+ ctx->rsrc_node, file_slot);
if (err)
break;
file_slot->file_ptr = 0;
+ file_slot->rpoll = NULL;
+ file_slot->wpoll = NULL;
needs_switch = true;
}
if (fd != -1) {
@@ -8526,6 +8800,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
fput(file);
break;
}
+ io_register_fixed_poll(file_slot, file);
}
}
--
2.14.4.44.g2045bb6
^ permalink raw reply related [flat|nested] 5+ messages in thread