From: Jens Axboe <[email protected]>
To: Mark Papadakis <[email protected]>, [email protected]
Subject: Re: io_uring and spurious wake-ups from eventfd
Date: Tue, 7 Jan 2020 13:34:11 -0700
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>
On 1/7/20 1:26 PM, Jens Axboe wrote:
> On 1/7/20 8:55 AM, Mark Papadakis wrote:
>> This is perhaps an odd request, but if it's trivial to implement
>> support for the feature described below, it could help others as it'd
>> help me (I've been experimenting with io_uring for some time now).
>>
>> Being able to register an eventfd with an io_uring context is very
>> handy if you have, e.g., a reactor thread multiplexing I/O with epoll
>> and you want to be notified when there are pending CQEs to drain. The
>> problem, such as it is, is that this can result in unnecessary/spurious
>> wake-ups.
>>
>> If, for example, you are monitoring some non-blocking sockets for
>> EPOLLIN, then when epoll reports that they have bytes pending to be
>> read, you reserve an SQE with preadv() for each reported event and call
>> io_uring_enter() to submit them. Because the data is readily available,
>> the completions are there as soon as io_uring_enter() returns, and you
>> can process them right away. The "problem" is that epoll will wake up
>> again in the very next reactor loop iteration, because the eventfd was
>> tripped by those completions (which is reasonable, but unnecessary).
>>
>> What if there were a flag for io_uring_setup() so that the eventfd
>> would only be tripped for CQEs that complete asynchronously, or, if
>> that's non-trivial, only for CQEs that reference file FDs?
>>
>> That’d help with that spurious wake-up.
>
> One easy way to do that would be for the application to signal that it
> doesn't want eventfd notifications for certain requests. Like using an
> IOSQE_ flag for that. Then you could set that on the requests you submit
> in response to triggering an eventfd event.
Something like this - totally untested. Note that for some error cases I
didn't go through the trouble of wiring this up, but for all the normal
cases it should work.
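From the application side, usage would look roughly like the below. This
is just a sketch (untested); it assumes the flag goes in as
IOSQE_NO_EVENTFD like in the patch further down, and since liburing
doesn't know about the flag yet it's defined locally here:

#include <errno.h>
#include <sys/uio.h>
#include <liburing.h>

#ifndef IOSQE_NO_EVENTFD
#define IOSQE_NO_EVENTFD	(1U << 5)	/* from the patch below, not in any released header */
#endif

/*
 * Queue a read for a socket that epoll just reported as readable. The
 * data is already there, so we don't want this completion to trip the
 * registered eventfd and wake us up again on the next epoll_wait().
 */
static int queue_ready_read(struct io_uring *ring, int sockfd,
			    struct iovec *iov)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_readv(sqe, sockfd, iov, 1, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_NO_EVENTFD);
	io_uring_sqe_set_data(sqe, (void *)(unsigned long) sockfd);
	return io_uring_submit(ring);
}

The eventfd registration (io_uring_register_eventfd()) and the epoll side
stay exactly as they are today; requests submitted from other paths just
leave the flag off and keep triggering the eventfd.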
Patch is against my for-5.6/io_uring branch.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c770c2c0eb52..5c6da1e1f29c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -512,6 +512,7 @@ struct io_kiocb {
#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */
#define REQ_F_CUR_POS 262144 /* read/write uses file position */
+#define REQ_F_NO_EVFD 524288 /* don't trigger eventfd for this req */
u64 user_data;
u32 result;
u32 sequence;
@@ -960,13 +961,13 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return &rings->cqes[tail & ctx->cq_mask];
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx, bool evfd)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
- if (ctx->cq_ev_fd)
+ if (evfd && ctx->cq_ev_fd)
eventfd_signal(ctx->cq_ev_fd, 1);
}
@@ -1018,7 +1019,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
clear_bit(0, &ctx->cq_check_overflow);
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, true);
while (!list_empty(&list)) {
req = list_first_entry(&list, struct io_kiocb, list);
@@ -1070,7 +1071,7 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1288,7 +1289,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
req->flags |= REQ_F_LINK_NEXT;
if (wake_ev)
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, true);
}
/*
@@ -1320,7 +1321,7 @@ static void io_fail_links(struct io_kiocb *req)
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, true);
}
static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
@@ -3366,7 +3367,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
io_poll_complete(req, mask, ret);
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
if (ret < 0)
req_set_fail_links(req);
@@ -3379,6 +3380,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
{
struct io_kiocb *req, *tmp;
struct req_batch rb;
+ bool evfd = false;
rb.to_free = rb.need_iter = 0;
spin_lock_irq(&ctx->completion_lock);
@@ -3386,6 +3388,8 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
+ if (!evfd && !(req->flags & REQ_F_NO_EVFD))
+ evfd = true;
if (refcount_dec_and_test(&req->refs) &&
!io_req_multi_free(&rb, req)) {
req->flags |= REQ_F_COMP_LOCKED;
@@ -3394,7 +3398,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
}
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, evfd);
io_free_req_many(ctx, &rb);
}
@@ -3439,7 +3443,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
req = NULL;
} else {
req->result = mask;
@@ -3557,7 +3561,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
spin_unlock_irq(&ctx->completion_lock);
if (mask) {
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
io_put_req_find_next(req, nxt);
}
return ipt.error;
@@ -3597,7 +3601,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
req_set_fail_links(req);
io_put_req(req);
return HRTIMER_NORESTART;
@@ -3659,7 +3663,7 @@ static int io_timeout_remove(struct io_kiocb *req)
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
if (ret < 0)
req_set_fail_links(req);
io_put_req(req);
@@ -3829,7 +3833,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
if (ret < 0)
req_set_fail_links(req);
@@ -4528,7 +4532,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_NO_EVENTFD)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index be64c9757ff1..72af96a38ed8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -53,6 +53,7 @@ struct io_uring_sqe {
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
#define IOSQE_ASYNC (1U << 4) /* always go async */
+#define IOSQE_NO_EVENTFD (1U << 5) /* don't trigger eventfd */
/*
* io_uring_setup() flags
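On the reactor side nothing changes - when the eventfd does fire (for
requests that didn't set the flag), you drain it and reap CQEs as usual.
Again just a sketch; handle_completion() is a stand-in for whatever the
application does with the result:

#include <stdint.h>
#include <sys/eventfd.h>
#include <liburing.h>

/* application-provided; placeholder for per-completion handling */
void handle_completion(uint64_t user_data, int res);

/* Called when epoll reports the registered eventfd as readable */
static void reap_completions(struct io_uring *ring, int evfd)
{
	struct io_uring_cqe *cqe;
	eventfd_t count;

	/* clear the counter so epoll doesn't report the eventfd again */
	eventfd_read(evfd, &count);

	while (io_uring_peek_cqe(ring, &cqe) == 0) {
		handle_completion(cqe->user_data, cqe->res);
		io_uring_cqe_seen(ring, cqe);
	}
}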
--
Jens Axboe
Thread overview: 11+ messages
2020-01-07 15:55 io_uring and spurious wake-ups from eventfd Mark Papadakis
2020-01-07 20:26 ` Jens Axboe
2020-01-07 20:34 ` Jens Axboe [this message]
2020-01-08 7:36 ` Mark Papadakis
2020-01-08 16:24 ` Jens Axboe
2020-01-08 16:46 ` Mark Papadakis
2020-01-08 16:50 ` Jens Axboe
2020-01-08 17:20 ` Jens Axboe
2020-01-08 18:08 ` Jens Axboe
2020-01-09 6:09 ` Daurnimator
2020-01-09 15:14 ` Jens Axboe