* [PATCH 1/8] io_uring: mask in error/nval/hangup consistently for poll
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 2/8] io_uring: allocate memory for overflowed CQEs Jens Axboe
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Instead of masking these bits in as part of regular POLL_ADD prep, do it
in io_init_poll_iocb(). Include NVAL, since that's generally unmaskable,
and RDHUP alongside the HUP that is already set.
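For illustration only (not part of the patch): with these bits always
armed, a consumer that asked just for POLLIN should still check the
completion result for error/hangup bits. A minimal liburing-based sketch,
assuming the ring was set up elsewhere:

#define _GNU_SOURCE
#include <poll.h>
#include <liburing.h>

/* sketch: classify one poll completion; cqe->res carries the poll mask */
static void reap_one_poll(struct io_uring *ring)
{
        struct io_uring_cqe *cqe;

        if (io_uring_wait_cqe(ring, &cqe))
                return;
        if (cqe->res < 0) {
                /* the poll request itself failed, e.g. -ECANCELED */
        } else if (cqe->res & (POLLERR | POLLHUP | POLLNVAL | POLLRDHUP)) {
                /* error/hangup reported even though only POLLIN was asked for */
        } else if (cqe->res & POLLIN) {
                /* fd is readable */
        }
        io_uring_cqe_seen(ring, cqe);
}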
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c0e630a40d7..28bccb5ad4e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4975,7 +4975,9 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
poll->head = NULL;
poll->done = false;
poll->canceled = false;
- poll->events = events;
+#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
+ /* mask in events that we always want/need */
+ poll->events = events | IO_POLL_UNMASK;
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func);
}
@@ -5337,8 +5339,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
#ifdef __BIG_ENDIAN
events = swahw32(events);
#endif
- poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
- (events & EPOLLEXCLUSIVE);
+ poll->events = demangle_poll(events) | (events & EPOLLEXCLUSIVE);
return 0;
}
--
2.31.0
* [PATCH 2/8] io_uring: allocate memory for overflowed CQEs
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
2021-03-19 20:35 ` [PATCH 1/8] io_uring: mask in error/nval/hangup consistently for poll Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 3/8] io_uring: include cflags in completion trace event Jens Axboe
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Pavel Begunkov, Jens Axboe
From: Pavel Begunkov <[email protected]>
Instead of using a request itself for overflowed CQE stashing, allocate a
separate entry. The disadvantage is that the allocation may fail and it
will be accounted as lost (see rings->cq_overflow), so we lose reliability
in case of memory pressure if the application is driving the CQ ring into
overflow. However, it opens the way for multiple CQEs per SQE, and even
for generating SQE-less CQEs.
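Purely as a sketch of the userspace-visible side (not part of the patch):
an application that drives the CQ ring into overflow can watch the
kernel's overflow counter to learn how many completions were dropped.
This assumes liburing exposes that counter as cq.koverflow:

#include <liburing.h>

/*
 * Sketch: report how many CQEs were dropped since the last check.
 * Assumes struct io_uring_cq's koverflow pointer mirrors the kernel's
 * rings->cq_overflow counter.
 */
static unsigned dropped_cqes(struct io_uring *ring, unsigned *last_seen)
{
        unsigned now = *ring->cq.koverflow;
        unsigned dropped = now - *last_seen;

        *last_seen = now;
        return dropped;
}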
Signed-off-by: Pavel Begunkov <[email protected]>
[axboe: use GFP_ATOMIC | __GFP_ACCOUNT]
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 101 +++++++++++++++++++++++---------------------------
1 file changed, 46 insertions(+), 55 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 28bccb5ad4e6..a91033cc62c3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -203,6 +203,11 @@ struct io_mapped_ubuf {
struct io_ring_ctx;
+struct io_overflow_cqe {
+ struct io_uring_cqe cqe;
+ struct list_head list;
+};
+
struct io_rsrc_put {
struct list_head list;
union {
@@ -1413,41 +1418,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
/* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
- struct task_struct *tsk,
- struct files_struct *files)
+static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
struct io_rings *rings = ctx->rings;
- struct io_kiocb *req, *tmp;
- struct io_uring_cqe *cqe;
unsigned long flags;
bool all_flushed, posted;
- LIST_HEAD(list);
if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
return false;
posted = false;
spin_lock_irqsave(&ctx->completion_lock, flags);
- list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (!io_match_task(req, tsk, files))
- continue;
+ while (!list_empty(&ctx->cq_overflow_list)) {
+ struct io_uring_cqe *cqe = io_get_cqring(ctx);
+ struct io_overflow_cqe *ocqe;
- cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
-
- list_move(&req->compl.list, &list);
- if (cqe) {
- WRITE_ONCE(cqe->user_data, req->user_data);
- WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->compl.cflags);
- } else {
- ctx->cached_cq_overflow++;
+ ocqe = list_first_entry(&ctx->cq_overflow_list,
+ struct io_overflow_cqe, list);
+ if (cqe)
+ memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+ else
WRITE_ONCE(ctx->rings->cq_overflow,
- ctx->cached_cq_overflow);
- }
+ ++ctx->cached_cq_overflow);
posted = true;
+ list_del(&ocqe->list);
+ kfree(ocqe);
}
all_flushed = list_empty(&ctx->cq_overflow_list);
@@ -1462,19 +1459,10 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (posted)
io_cqring_ev_posted(ctx);
-
- while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, compl.list);
- list_del(&req->compl.list);
- io_put_req(req);
- }
-
return all_flushed;
}
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
- struct task_struct *tsk,
- struct files_struct *files)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool ret = true;
@@ -1482,7 +1470,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
+ ret = __io_cqring_overflow_flush(ctx, force);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
@@ -1543,29 +1531,33 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed ||
- atomic_read(&req->task->io_uring->in_idle)) {
- /*
- * If we're in ring overflow flush mode, or in task cancel mode,
- * then we cannot store the request for later flushing, we need
- * to drop it on the floor.
- */
- ctx->cached_cq_overflow++;
- WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
- } else {
+ return;
+ }
+ if (!ctx->cq_overflow_flushed &&
+ !atomic_read(&req->task->io_uring->in_idle)) {
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ if (!ocqe)
+ goto overflow;
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
- if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
- io_clean_op(req);
-
- req->result = res;
- req->compl.cflags = cflags;
- req_ref_get(req);
- list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
+ ocqe->cqe.user_data = req->user_data;
+ ocqe->cqe.res = res;
+ ocqe->cqe.flags = cflags;
+ list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+ return;
}
+overflow:
+ /*
+ * If we're in ring overflow flush mode, or in task cancel mode,
+ * or cannot allocate an overflow entry, then we need to drop it
+ * on the floor.
+ */
+ WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -2410,7 +2402,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* already triggered a CQE (eg in error).
*/
if (test_bit(0, &ctx->cq_check_overflow))
- __io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ __io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
break;
@@ -6580,7 +6572,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
- if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
+ if (!__io_cqring_overflow_flush(ctx, false))
return -EBUSY;
}
@@ -6873,7 +6865,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
int ret;
do {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx) >= min_events)
return 0;
if (!io_run_task_work())
@@ -6905,7 +6897,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
trace_io_uring_cqring_wait(ctx, min_events);
do {
/* if we can't even flush overflow, don't wait for more */
- if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+ if (!io_cqring_overflow_flush(ctx, false)) {
ret = -EBUSY;
break;
}
@@ -8578,7 +8570,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if force is set, the ring is going away. always drop after that */
ctx->cq_overflow_flushed = 1;
if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true, NULL, NULL);
+ __io_cqring_overflow_flush(ctx, true);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
@@ -8736,7 +8728,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_kill_timeouts(ctx, task, files);
ret |= io_run_task_work();
ret |= io_run_ctx_fallback(ctx);
- io_cqring_overflow_flush(ctx, true, task, files);
if (!ret)
break;
cond_resched();
@@ -9153,7 +9144,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_cqring_overflow_flush(ctx, false);
ret = -EOWNERDEAD;
if (unlikely(ctx->sq_data->thread == NULL)) {
--
2.31.0
* [PATCH 3/8] io_uring: include cflags in completion trace event
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
2021-03-19 20:35 ` [PATCH 1/8] io_uring: mask in error/nval/hangup consistently for poll Jens Axboe
2021-03-19 20:35 ` [PATCH 2/8] io_uring: allocate memory for overflowed CQEs Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 4/8] io_uring: add multishot mode for IORING_OP_POLL_ADD Jens Axboe
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
We should be including the completion flags for better introspection on
exactly what completion event was logged.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 2 +-
include/trace/events/io_uring.h | 11 +++++++----
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a91033cc62c3..a4987bbe523c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1519,7 +1519,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, req->user_data, res);
+ trace_io_uring_complete(ctx, req->user_data, res, cflags);
/*
* If we can't get a cq entry, userspace overflowed the
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 9f0d3b7d56b0..bd528176a3d5 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -290,29 +290,32 @@ TRACE_EVENT(io_uring_fail_link,
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: result of the request
+ * @cflags: completion flags
*
*/
TRACE_EVENT(io_uring_complete,
- TP_PROTO(void *ctx, u64 user_data, long res),
+ TP_PROTO(void *ctx, u64 user_data, long res, unsigned cflags),
- TP_ARGS(ctx, user_data, res),
+ TP_ARGS(ctx, user_data, res, cflags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( long, res )
+ __field( unsigned, cflags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
+ __entry->cflags = cflags;
),
- TP_printk("ring %p, user_data 0x%llx, result %ld",
+ TP_printk("ring %p, user_data 0x%llx, result %ld, cflags %x",
__entry->ctx, (unsigned long long)__entry->user_data,
- __entry->res)
+ __entry->res, __entry->cflags)
);
--
2.31.0
* [PATCH 4/8] io_uring: add multishot mode for IORING_OP_POLL_ADD
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
` (2 preceding siblings ...)
2021-03-19 20:35 ` [PATCH 3/8] io_uring: include cflags in completion trace event Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 5/8] io_uring: abstract out helper for removing poll waitqs/hashes Jens Axboe
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
The default io_uring poll mode is one-shot, where once the event triggers,
the poll command is completed and won't trigger any further events. If
we're doing repeated polling on the same file or socket, then it can be
more efficient to do multishot, where we keep triggering whenever the
event becomes true.
This deviates from the usual norm of having one CQE per SQE submitted. Add
a CQE flag, IORING_CQE_F_MORE, which tells the application to expect
further completion events from the submitted SQE. Right now the only user
of this is POLL_ADD in multishot mode.
Since sqe->poll_events occupies the space we normally use for command
flags, use sqe->len as the flag space for POLL_ADD. Multishot mode is
selected by setting IORING_POLL_ADD_MULTI in sqe->len. An application
should expect more CQEs for the specified SQE as long as the CQE is
flagged with IORING_CQE_F_MORE. In multishot mode, only cancelation or an
error will terminate the poll request, in which case the flag will be
cleared.
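A minimal userspace sketch of the above (not part of the patch; assumes
liburing plus uapi headers that carry the new flags): arm a multishot
poll and keep reaping until the kernel stops setting IORING_CQE_F_MORE.
Error handling is trimmed for brevity.

#include <poll.h>
#include <liburing.h>

/* sketch: multishot poll on fd, terminated when F_MORE is no longer set */
static void poll_multishot(struct io_uring *ring, int fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        unsigned flags = 0;

        io_uring_prep_poll_add(sqe, fd, POLLIN);
        sqe->len = IORING_POLL_ADD_MULTI;       /* select multishot mode */
        io_uring_submit(ring);

        do {
                if (io_uring_wait_cqe(ring, &cqe))
                        break;
                flags = cqe->flags;
                /* cqe->res is the poll mask on success, or a negative error */
                io_uring_cqe_seen(ring, cqe);
        } while (flags & IORING_CQE_F_MORE);
}

Each triggering of POLLIN posts another CQE against the same user_data
until cancelation or an error clears the flag.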
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 64 +++++++++++++++++++++++++----------
include/uapi/linux/io_uring.h | 12 +++++++
2 files changed, 58 insertions(+), 18 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a4987bbe523c..ede66f620941 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4896,17 +4896,25 @@ static void io_poll_remove_double(struct io_kiocb *req)
}
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags = IORING_CQE_F_MORE;
- if (!error && req->poll.canceled)
+ if (!error && req->poll.canceled) {
error = -ECANCELED;
-
- io_poll_remove_double(req);
- req->poll.done = true;
- io_cqring_fill_event(req, error ? error : mangle_poll(mask));
+ req->poll.events |= EPOLLONESHOT;
+ }
+ if (error || (req->poll.events & EPOLLONESHOT)) {
+ io_poll_remove_double(req);
+ req->poll.done = true;
+ flags = 0;
+ }
+ if (!error)
+ error = mangle_poll(mask);
+ __io_cqring_fill_event(req, error, flags);
io_commit_cqring(ctx);
+ return !(flags & IORING_CQE_F_MORE);
}
static void io_poll_task_func(struct callback_head *cb)
@@ -4918,14 +4926,25 @@ static void io_poll_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
} else {
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
+ bool done, post_ev;
+
+ post_ev = done = io_poll_complete(req, req->result, 0);
+ if (done) {
+ hash_del(&req->hash_node);
+ } else if (!(req->poll.events & EPOLLONESHOT)) {
+ post_ev = true;
+ req->result = 0;
+ add_wait_queue(req->poll.head, &req->poll.wait);
+ }
spin_unlock_irq(&ctx->completion_lock);
- nxt = io_put_req_find_next(req);
- io_cqring_ev_posted(ctx);
- if (nxt)
- __io_req_task_submit(nxt);
+ if (post_ev)
+ io_cqring_ev_posted(ctx);
+ if (done) {
+ nxt = io_put_req_find_next(req);
+ if (nxt)
+ __io_req_task_submit(nxt);
+ }
}
}
@@ -4939,6 +4958,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
+ if (!(poll->events & EPOLLONESHOT))
+ return poll->wait.func(&poll->wait, mode, sync, key);
list_del_init(&wait->entry);
@@ -5104,7 +5125,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
ipt->error = 0;
mask = 0;
}
- if (mask || ipt->error)
+ if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
@@ -5147,7 +5168,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
req->apoll = apoll;
- mask = 0;
+ mask = EPOLLONESHOT;
if (def->pollin)
mask |= POLLIN | POLLRDNORM;
if (def->pollout)
@@ -5320,18 +5341,24 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- u32 events;
+ u32 events, flags;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->len);
+ if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
events = swahw32(events);
#endif
- poll->events = demangle_poll(events) | (events & EPOLLEXCLUSIVE);
+ if (!flags)
+ events |= EPOLLONESHOT;
+ poll->events = demangle_poll(events) |
+ (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
return 0;
}
@@ -5355,7 +5382,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req(req);
+ if (poll->events & EPOLLONESHOT)
+ io_put_req(req);
}
return ipt.error;
}
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2514eb6b1cf2..76c967621601 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -159,6 +159,16 @@ enum {
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
+ * the poll handler will continue to report
+ * CQEs on behalf of the same SQE.
+ */
+#define IORING_POLL_ADD_MULTI (1U << 0)
+
/*
* IO completion data structure (Completion Queue Entry)
*/
@@ -172,8 +182,10 @@ struct io_uring_cqe {
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
*/
#define IORING_CQE_F_BUFFER (1U << 0)
+#define IORING_CQE_F_MORE (1U << 1)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
--
2.31.0
* [PATCH 5/8] io_uring: abstract out helper for removing poll waitqs/hashes
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
` (3 preceding siblings ...)
2021-03-19 20:35 ` [PATCH 4/8] io_uring: add multishot mode for IORING_OP_POLL_ADD Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 6/8] io_uring: terminate multishot poll for CQ ring overflow Jens Axboe
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
No functional changes in this patch, just preparation for killing
multishot poll on CQ overflow.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ede66f620941..53378003bd3b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5214,7 +5214,7 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
return do_complete;
}
-static bool io_poll_remove_one(struct io_kiocb *req)
+static bool io_poll_remove_waitqs(struct io_kiocb *req)
{
bool do_complete;
@@ -5234,6 +5234,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
}
}
+ return do_complete;
+}
+
+static bool io_poll_remove_one(struct io_kiocb *req)
+{
+ bool do_complete;
+
+ do_complete = io_poll_remove_waitqs(req);
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
--
2.31.0
* [PATCH 6/8] io_uring: terminate multishot poll for CQ ring overflow
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
` (4 preceding siblings ...)
2021-03-19 20:35 ` [PATCH 5/8] io_uring: abstract out helper for removing poll waitqs/hashes Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 7/8] io_uring: abstract out a io_poll_find_helper() Jens Axboe
2021-03-19 20:35 ` [PATCH 8/8] io_uring: allow events and user_data update of running poll requests Jens Axboe
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
If we hit overflow and fail to allocate an overflow entry for the
completion, terminate the multishot poll mode.
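For the userspace side (illustration only, not part of the patch): once a
multishot CQE arrives without IORING_CQE_F_MORE, the poll is no longer
armed and must be resubmitted if further events are wanted. A rough
sketch:

#include <poll.h>
#include <liburing.h>

/* sketch: re-arm a multishot poll after the kernel terminated it */
static void handle_multishot_cqe(struct io_uring *ring, int fd,
                                 struct io_uring_cqe *cqe)
{
        struct io_uring_sqe *sqe;

        if (cqe->flags & IORING_CQE_F_MORE)
                return;
        /* terminated by error, cancelation, or (with this patch) CQ overflow */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_poll_add(sqe, fd, POLLIN);
        sqe->len = IORING_POLL_ADD_MULTI;
        io_uring_submit(ring);
}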
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 53378003bd3b..103daef0db34 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1044,6 +1044,7 @@ static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
struct io_ring_ctx *ctx);
+static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -1513,7 +1514,7 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
-static void __io_cqring_fill_event(struct io_kiocb *req, long res,
+static bool __io_cqring_fill_event(struct io_kiocb *req, long res,
unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1531,7 +1532,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- return;
+ return true;
}
if (!ctx->cq_overflow_flushed &&
!atomic_read(&req->task->io_uring->in_idle)) {
@@ -1549,7 +1550,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
- return;
+ return true;
}
overflow:
/*
@@ -1558,6 +1559,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
* on the floor.
*/
WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+ return false;
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -4905,14 +4907,14 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
error = -ECANCELED;
req->poll.events |= EPOLLONESHOT;
}
- if (error || (req->poll.events & EPOLLONESHOT)) {
- io_poll_remove_double(req);
+ if (!error)
+ error = mangle_poll(mask);
+ if (!__io_cqring_fill_event(req, error, flags) ||
+ (req->poll.events & EPOLLONESHOT)) {
+ io_poll_remove_waitqs(req);
req->poll.done = true;
flags = 0;
}
- if (!error)
- error = mangle_poll(mask);
- __io_cqring_fill_event(req, error, flags);
io_commit_cqring(ctx);
return !(flags & IORING_CQE_F_MORE);
}
@@ -5203,6 +5205,8 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
{
bool do_complete = false;
+ if (!poll->head)
+ return false;
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
--
2.31.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 7/8] io_uring: abstract out a io_poll_find_helper()
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
` (5 preceding siblings ...)
2021-03-19 20:35 ` [PATCH 6/8] io_uring: terminate multishot poll for CQ ring overflow Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
2021-03-19 20:35 ` [PATCH 8/8] io_uring: allow events and user_data update of running poll requests Jens Axboe
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
We'll need this helper for another purpose. For now, just abstract it out
and have io_poll_cancel() use it for lookups.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 103daef0db34..55a7674eb3b6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5284,7 +5284,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
return posted != 0;
}
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr)
{
struct hlist_head *list;
struct io_kiocb *req;
@@ -5293,12 +5293,23 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
hlist_for_each_entry(req, list, hash_node) {
if (sqe_addr != req->user_data)
continue;
- if (io_poll_remove_one(req))
- return 0;
- return -EALREADY;
+ return req;
}
- return -ENOENT;
+ return NULL;
+}
+
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+{
+ struct io_kiocb *req;
+
+ req = io_poll_find(ctx, sqe_addr);
+ if (!req)
+ return -ENOENT;
+ if (io_poll_remove_one(req))
+ return 0;
+
+ return -EALREADY;
}
static int io_poll_remove_prep(struct io_kiocb *req,
--
2.31.0
* [PATCH 8/8] io_uring: allow events and user_data update of running poll requests
2021-03-19 20:35 [PATCHSET v2 0/8] Poll improvements Jens Axboe
` (6 preceding siblings ...)
2021-03-19 20:35 ` [PATCH 7/8] io_uring: abstract out a io_poll_find_helper() Jens Axboe
@ 2021-03-19 20:35 ` Jens Axboe
7 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-03-19 20:35 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
This adds two new POLL_ADD flags, IORING_POLL_UPDATE_EVENTS and
IORING_POLL_UPDATE_USER_DATA. As with the other POLL_ADD flag, these are
masked into sqe->len. If set, the POLL_ADD will have the following
behavior:
- sqe->addr must contain the user_data of the poll request that
needs to be modified. This field is otherwise invalid for a POLL_ADD
command.
- If IORING_POLL_UPDATE_EVENTS is set, sqe->poll_events must contain the
new mask for the existing poll request. There are no checks for whether
these are identical or not; if a matching poll request is found, it is
re-armed with the new mask.
- If IORING_POLL_UPDATE_USER_DATA is set, sqe->off must contain the new
user_data for the existing poll request.
A POLL_ADD with any of these flags set may complete with any of the
following results:
1) 0, which means that we successfully found the existing poll request
specified, and performed the re-arm procedure. Any error from that
re-arm will be exposed as a completion event for that original poll
request, not for the update request.
2) -ENOENT, if no existing poll request was found with the given
user_data.
3) -EALREADY, if the existing poll request was already in the process of
being removed/canceled/completing.
4) -EACCES, if an attempt was made to modify an internal poll request
(eg not one originally issued as IORING_OP_POLL_ADD).
The usual -EINVAL cases apply as well, if any invalid fields are set
in the sqe for this command type.
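To make the calling convention concrete, a hedged userspace sketch (not
from the patch; field usage follows the description above, and the fd is
assumed to be the one the original poll targets): switch an armed poll
from POLLIN to POLLOUT and give it a new user_data.

#include <poll.h>
#include <liburing.h>

/* sketch: update events and user_data of a running poll request */
static void update_poll(struct io_uring *ring, int fd,
                        __u64 old_ud, __u64 new_ud)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        io_uring_prep_poll_add(sqe, fd, POLLOUT);       /* new events mask */
        sqe->len = IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA;
        sqe->addr = old_ud;     /* user_data of the poll request to modify */
        sqe->off = new_ud;      /* its new user_data */
        io_uring_submit(ring);
        /* the update's CQE res is 0, -ENOENT, -EALREADY, or -EACCES */
}

Any error from the re-arm itself is reported on the original request's
CQE, as noted in case 1) above.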
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 90 ++++++++++++++++++++++++++++++++---
include/uapi/linux/io_uring.h | 5 ++
2 files changed, 89 insertions(+), 6 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 55a7674eb3b6..10b67041ca30 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -488,7 +488,15 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry wait;
+ bool update_events;
+ bool update_user_data;
+ union {
+ struct wait_queue_entry wait;
+ struct {
+ u64 old_user_data;
+ u64 new_user_data;
+ };
+ };
};
struct io_poll_remove {
@@ -4990,6 +4998,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
poll->head = NULL;
poll->done = false;
poll->canceled = false;
+ poll->update_events = poll->update_user_data = false;
#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
/* mask in events that we always want/need */
poll->events = events | IO_POLL_UNMASK;
@@ -5368,24 +5377,36 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
flags = READ_ONCE(sqe->len);
- if (flags & ~IORING_POLL_ADD_MULTI)
+ if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS |
+ IORING_POLL_UPDATE_USER_DATA))
return -EINVAL;
-
events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
events = swahw32(events);
#endif
- if (!flags)
+ if (!(flags & IORING_POLL_ADD_MULTI))
events |= EPOLLONESHOT;
+ poll->update_events = poll->update_user_data = false;
+ if (flags & IORING_POLL_UPDATE_EVENTS) {
+ poll->update_events = true;
+ poll->old_user_data = READ_ONCE(sqe->addr);
+ }
+ if (flags & IORING_POLL_UPDATE_USER_DATA) {
+ poll->update_user_data = true;
+ poll->new_user_data = READ_ONCE(sqe->off);
+ }
+ if (!(poll->update_events || poll->update_user_data) &&
+ (sqe->off || sqe->addr))
+ return -EINVAL;
poll->events = demangle_poll(events) |
(events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
return 0;
}
-static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_poll_add(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
@@ -5411,6 +5432,63 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
return ipt.error;
}
+static int io_poll_update(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *preq;
+ int ret;
+
+ spin_lock_irq(&ctx->completion_lock);
+ preq = io_poll_find(ctx, req->poll.old_user_data);
+ if (!preq) {
+ ret = -ENOENT;
+ goto err;
+ } else if (preq->opcode != IORING_OP_POLL_ADD) {
+ /* don't allow internal poll updates */
+ ret = -EACCES;
+ goto err;
+ }
+ if (!__io_poll_remove_one(preq, &preq->poll)) {
+ /* in process of completing/removal */
+ ret = -EALREADY;
+ goto err;
+ }
+ /* we now have a detached poll request. reissue. */
+ ret = 0;
+err:
+ spin_unlock_irq(&ctx->completion_lock);
+ if (ret < 0) {
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+ }
+ /* only mask one event flags, keep behavior flags */
+ if (req->poll.update_events) {
+ preq->poll.events &= ~0xffff;
+ preq->poll.events |= req->poll.events & 0xffff;
+ preq->poll.events |= IO_POLL_UNMASK;
+ }
+ if (req->poll.update_user_data)
+ preq->user_data = req->poll.new_user_data;
+
+ /* complete update request, we're done with it */
+ io_req_complete(req, ret);
+
+ ret = __io_poll_add(preq);
+ if (ret < 0) {
+ req_set_fail_links(preq);
+ io_req_complete(preq, ret);
+ }
+ return 0;
+}
+
+static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
+{
+ if (!req->poll.update_events && !req->poll.update_user_data)
+ return __io_poll_add(req);
+ return io_poll_update(req);
+}
+
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 76c967621601..5beaa6bbc6db 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -166,8 +166,13 @@ enum {
* IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
* the poll handler will continue to report
* CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE Update existing poll request, matching
+ * sqe->addr as the old user_data field.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
+#define IORING_POLL_UPDATE_EVENTS (1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
/*
* IO completion data structure (Completion Queue Entry)
--
2.31.0