* [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
` (11 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
io-wq keeps an array of pointers to struct io_wqe, allocate this array
as a part of struct io-wq, it's easier to code and saves an extra
indirection for nearly each io-wq call.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 15 ++++-----------
1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index de9b7ba3ba01..961c4dbf1220 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -102,7 +102,6 @@ struct io_wqe {
* Per io_wq state
*/
struct io_wq {
- struct io_wqe **wqes;
unsigned long state;
free_work_fn *free_work;
@@ -118,6 +117,8 @@ struct io_wq {
struct hlist_node cpuhp_node;
struct task_struct *task;
+
+ struct io_wqe *wqes[];
};
static enum cpuhp_state io_wq_online;
@@ -907,17 +908,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
-
- wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
- if (!wq->wqes)
- goto err_wq;
-
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (ret)
- goto err_wqes;
+ goto err_wq;
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
@@ -962,8 +958,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
kfree(wq->wqes[node]);
-err_wqes:
- kfree(wq->wqes);
err_wq:
kfree(wq);
return ERR_PTR(ret);
@@ -1033,7 +1027,6 @@ static void io_wq_destroy(struct io_wq *wq)
kfree(wqe);
}
io_wq_put_hash(wq->hash);
- kfree(wq->wqes);
kfree(wq);
}
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 02/13] io-wq: remove unused io-wq refcounting
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
` (10 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
iowq->refs is initialised to one and killed on exit, so it's not used
and we can kill it.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 961c4dbf1220..a0e43d1b94af 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -109,8 +109,6 @@ struct io_wq {
struct io_wq_hash *hash;
- refcount_t refs;
-
atomic_t worker_refs;
struct completion worker_done;
@@ -949,7 +947,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}
wq->task = get_task_struct(data->task);
- refcount_set(&wq->refs, 1);
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
return wq;
@@ -1035,8 +1032,7 @@ void io_wq_put_and_exit(struct io_wq *wq)
WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
io_wq_exit_workers(wq);
- if (refcount_dec_and_test(&wq->refs))
- io_wq_destroy(wq);
+ io_wq_destroy(wq);
}
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 03/13] io_uring: refactor io_iopoll_req_issued
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
2021-05-24 23:51 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
` (9 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
A simple refactoring of io_iopoll_req_issued(), move in_async inside so
we don't pass it around and save on double checking it.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 44 +++++++++++++++++++++-----------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 057fbc8a190c..df1510adaaf7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2520,9 +2520,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
+static void io_iopoll_req_issued(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ const bool in_async = io_wq_current_is_worker();
+
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (unlikely(in_async))
+ mutex_lock(&ctx->uring_lock);
/*
* Track whether we have multiple files in our lists. This will impact
@@ -2549,14 +2554,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- /*
- * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
- * task context or in io worker task context. If current task context is
- * sq thread, we don't need to check whether should wake up sq thread.
- */
- if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
+ if (unlikely(in_async)) {
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
+ * in sq thread task context or in io worker task context. If
+ * current task context is sq thread, we don't need to check
+ * whether should wake up sq thread.
+ */
+ if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
+
+ mutex_unlock(&ctx->uring_lock);
+ }
}
static inline void io_state_file_put(struct io_submit_state *state)
@@ -6210,23 +6220,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (creds)
revert_creds(creds);
-
if (ret)
return ret;
-
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
- const bool in_async = io_wq_current_is_worker();
-
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (in_async)
- mutex_lock(&ctx->uring_lock);
-
- io_iopoll_req_issued(req, in_async);
-
- if (in_async)
- mutex_unlock(&ctx->uring_lock);
- }
+ if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req);
return 0;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 04/13] io_uring: rename function *task_file
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (2 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
` (8 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
What at some moment was references to struct file used to control
lifetimes of task/ctx is now just internal tctx structures/nodes,
so rename outdated *task_file() routines into something more sensible.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index df1510adaaf7..c34df3e01151 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1024,7 +1024,7 @@ static const struct io_op_def io_op_defs[] = {
};
static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_task_file(unsigned long index);
+static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
@@ -8706,7 +8706,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
* node. It'll be removed by the end of cancellation, just ignore it.
*/
if (!atomic_read(&tctx->in_idle))
- io_uring_del_task_file((unsigned long)work->ctx);
+ io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
@@ -8959,7 +8959,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
}
}
-static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -8996,19 +8996,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
if (likely(tctx && tctx->last == ctx))
return 0;
- return __io_uring_add_task_file(ctx);
+ return __io_uring_add_tctx_node(ctx);
}
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_task_file(unsigned long index)
+static void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9039,7 +9039,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
tctx->io_wq = NULL;
xa_for_each(&tctx->xa, index, node)
- io_uring_del_task_file(index);
+ io_uring_del_tctx_node(index);
if (wq)
io_wq_put_and_exit(wq);
}
@@ -9317,7 +9317,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(ctx);
+ ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -9527,7 +9527,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
if (fd < 0)
return fd;
- ret = io_uring_add_task_file(ctx);
+ ret = io_uring_add_tctx_node(ctx);
if (ret) {
put_unused_fd(fd);
return ret;
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 05/13] io-wq: replace goto while
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (3 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-27 21:48 ` Noah Goldstein
2021-05-24 23:51 ` [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
` (7 subsequent siblings)
12 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
If for/while is simple enough it's more prefered over goto with labels
as the former one is more explicit and easier to read. So, replace a
trivial goto-based loop in io_wqe_worker() with a while.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index a0e43d1b94af..712eb062f822 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock_irq(&wqe->lock);
- if (io_wqe_run_queue(wqe)) {
+ while (1) {
+ raw_spin_lock_irq(&wqe->lock);
+ if (!io_wqe_run_queue(wqe))
+ break;
io_worker_handle_work(worker);
- goto loop;
}
+
__io_worker_idle(wqe, worker);
raw_spin_unlock_irq(&wqe->lock);
if (io_flush_signals())
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 05/13] io-wq: replace goto while
2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
@ 2021-05-27 21:48 ` Noah Goldstein
2021-05-27 22:18 ` Pavel Begunkov
0 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:48 UTC (permalink / raw)
To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING
On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
>
> If for/while is simple enough it's more prefered over goto with labels
> as the former one is more explicit and easier to read. So, replace a
> trivial goto-based loop in io_wqe_worker() with a while.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
> fs/io-wq.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index a0e43d1b94af..712eb062f822 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
> long ret;
>
> set_current_state(TASK_INTERRUPTIBLE);
> -loop:
> - raw_spin_lock_irq(&wqe->lock);
> - if (io_wqe_run_queue(wqe)) {
> + while (1) {
> + raw_spin_lock_irq(&wqe->lock);
Can acquiring the spinlock be hoisted from the loop?
> + if (!io_wqe_run_queue(wqe))
> + break;
> io_worker_handle_work(worker);
> - goto loop;
> }
> +
> __io_worker_idle(wqe, worker);
> raw_spin_unlock_irq(&wqe->lock);
> if (io_flush_signals())
> --
> 2.31.1
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 05/13] io-wq: replace goto while
2021-05-27 21:48 ` Noah Goldstein
@ 2021-05-27 22:18 ` Pavel Begunkov
0 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:18 UTC (permalink / raw)
To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING
On 5/27/21 10:48 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
>>
>> If for/while is simple enough it's more prefered over goto with labels
>> as the former one is more explicit and easier to read. So, replace a
>> trivial goto-based loop in io_wqe_worker() with a while.
>>
>> Signed-off-by: Pavel Begunkov <[email protected]>
>> ---
>> fs/io-wq.c | 9 +++++----
>> 1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/io-wq.c b/fs/io-wq.c
>> index a0e43d1b94af..712eb062f822 100644
>> --- a/fs/io-wq.c
>> +++ b/fs/io-wq.c
>> @@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
>> long ret;
>>
>> set_current_state(TASK_INTERRUPTIBLE);
>> -loop:
>> - raw_spin_lock_irq(&wqe->lock);
>> - if (io_wqe_run_queue(wqe)) {
>> + while (1) {
>> + raw_spin_lock_irq(&wqe->lock);
> Can acquiring the spinlock be hoisted from the loop?
lock();
while(1) { ... }
unlock();
If this, then no. If taken inside io_worker_handle_work(),
maybe, but not with this commit.
>> + if (!io_wqe_run_queue(wqe))
>> + break;
>> io_worker_handle_work(worker);
>> - goto loop;
>> }
>> +
>> __io_worker_idle(wqe, worker);
>> raw_spin_unlock_irq(&wqe->lock);
>> if (io_flush_signals())
>> --
>> 2.31.1
>>
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (4 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 07/13] io-wq: simplify worker exiting Pavel Begunkov
` (6 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
io_wqe_worker()'s main loop does check IO_WQ_BIT_EXIT flag, so no need
for a second test_bit at the end as it will immediately jump to the
first check afterwards.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 712eb062f822..27a9ebbbf68e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -560,8 +560,7 @@ static int io_wqe_worker(void *data)
if (ret)
continue;
/* timed out, exit unless we're the fixed worker */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
- !(worker->flags & IO_WORKER_F_FIXED))
+ if (!(worker->flags & IO_WORKER_F_FIXED))
break;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 07/13] io-wq: simplify worker exiting
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (5 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
` (5 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
io_worker_handle_work() already takes care of the empty list case and
releases spinlock, so get rid of ugly conditional unlocking and
unconditionally call handle_work()
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 27a9ebbbf68e..c57dd50d24d9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -566,10 +566,7 @@ static int io_wqe_worker(void *data)
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
raw_spin_lock_irq(&wqe->lock);
- if (!wq_list_empty(&wqe->work_list))
- io_worker_handle_work(worker);
- else
- raw_spin_unlock_irq(&wqe->lock);
+ io_worker_handle_work(worker);
}
io_worker_exit(worker);
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (6 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 07/13] io-wq: simplify worker exiting Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 09/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
` (4 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
Make io_rsrc_data_alloc() taking care of rsrc tags loading on
registration, so we don't need to repeat it for each new rsrc type.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 55 +++++++++++++++++++++++++--------------------------
1 file changed, 27 insertions(+), 28 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c34df3e01151..24178ca660c4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7156,27 +7156,38 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
- rsrc_put_fn *do_put,
- unsigned nr)
+static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
+ unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
- return NULL;
+ return -ENOMEM;
data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
if (!data->tags) {
kfree(data);
- return NULL;
+ return -ENOMEM;
+ }
+ if (utags) {
+ for (i = 0; i < nr; i++) {
+ if (copy_from_user(&data->tags[i], &utags[i],
+ sizeof(data->tags[i]))) {
+ io_rsrc_data_free(data);
+ return -EFAULT;
+ }
+ }
}
atomic_set(&data->refs, 1);
data->ctx = ctx;
data->do_put = do_put;
init_completion(&data->done);
- return data;
+ *pdata = data;
+ return 0;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -7628,7 +7639,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret;
unsigned i;
- struct io_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7639,27 +7649,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+ &ctx->file_data);
+ if (ret)
+ return ret;
- file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
- if (!file_data)
- return -ENOMEM;
- ctx->file_data = file_data;
ret = -ENOMEM;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- u64 tag = 0;
-
- if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
- copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
- if (unlikely(tag))
+ if (unlikely(ctx->file_data->tags[i]))
goto out_fput;
continue;
}
@@ -7680,7 +7687,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- ctx->file_data->tags[i] = tag;
io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}
@@ -8395,9 +8401,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
- data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
- if (!data)
- return -ENOMEM;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+ if (ret)
+ return ret;
ret = io_buffers_map_alloc(ctx, nr_args);
if (ret) {
io_rsrc_data_free(data);
@@ -8405,19 +8411,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- u64 tag = 0;
-
- if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
- ret = -EFAULT;
- break;
- }
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
ret = io_buffer_validate(&iov);
if (ret)
break;
- if (!iov.iov_base && tag) {
+ if (!iov.iov_base && data->tags[i]) {
ret = -EINVAL;
break;
}
@@ -8426,7 +8426,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
&last_hpage);
if (ret)
break;
- data->tags[i] = tag;
}
WARN_ON_ONCE(ctx->buf_data);
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 09/13] io_uring: remove rsrc put work irq save/restore
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (7 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
` (3 subsequent siblings)
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
io_rsrc_put_work() is executed by workqueue in non-irq context, so no
need for irqsave/restore variants of spinlocking.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24178ca660c4..40b70c34c1b2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7550,14 +7550,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
if (prsrc->tag) {
bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
- unsigned long flags;
io_ring_submit_lock(ctx, lock_ring);
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock_irq(&ctx->completion_lock);
io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
ctx->cq_extra++;
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_ring_submit_unlock(ctx, lock_ring);
}
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 10/13] io_uring: add helpers for 2 level table alloc
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (8 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 09/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-27 21:43 ` Noah Goldstein
2021-05-24 23:51 ` [PATCH 11/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
` (2 subsequent siblings)
12 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
Some parts like fixed file table use 2 level tables, factor out helpers
for allocating/deallocating them as more users are to come.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 73 ++++++++++++++++++++++++++++++---------------------
1 file changed, 43 insertions(+), 30 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 40b70c34c1b2..1cc2d16637ff 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7054,14 +7054,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_page_table(void **table, size_t size)
{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
for (i = 0; i < nr_tables; i++)
- kfree(table->files[i]);
- kfree(table->files);
- table->files = NULL;
+ kfree(table[i]);
+ kfree(table);
+}
+
+static void **io_alloc_page_table(size_t size)
+{
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+ size_t init_size = size;
+ void **table;
+
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ for (i = 0; i < nr_tables; i++) {
+ unsigned int this_size = min(size, PAGE_SIZE);
+
+ table[i] = kzalloc(this_size, GFP_KERNEL);
+ if (!table[i]) {
+ io_free_page_table(table, init_size);
+ return NULL;
+ }
+ size -= this_size;
+ }
+ return table;
}
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7190,6 +7212,22 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
return 0;
}
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+ return !!table->files;
+}
+
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ io_free_page_table((void **)table->files, size);
+ table->files = NULL;
+}
+
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_UNIX)
@@ -7451,31 +7489,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
- if (!table->files)
- return false;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
- GFP_KERNEL);
- if (!table->files[i])
- break;
- nr_files -= this_files;
- }
-
- if (i == nr_tables)
- return true;
-
- io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
- return false;
-}
-
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 10/13] io_uring: add helpers for 2 level table alloc
2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
@ 2021-05-27 21:43 ` Noah Goldstein
2021-05-27 22:14 ` Pavel Begunkov
0 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:43 UTC (permalink / raw)
To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING
On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
>
> Some parts like fixed file table use 2 level tables, factor out helpers
> for allocating/deallocating them as more users are to come.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
> fs/io_uring.c | 73 ++++++++++++++++++++++++++++++---------------------
> 1 file changed, 43 insertions(+), 30 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 40b70c34c1b2..1cc2d16637ff 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -7054,14 +7054,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
> }
>
> -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
> +static void io_free_page_table(void **table, size_t size)
> {
> - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
> + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>
> for (i = 0; i < nr_tables; i++)
> - kfree(table->files[i]);
> - kfree(table->files);
> - table->files = NULL;
> + kfree(table[i]);
> + kfree(table);
> +}
> +
> +static void **io_alloc_page_table(size_t size)
> +{
> + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
> + size_t init_size = size;
> + void **table;
> +
> + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
> + if (!table)
> + return NULL;
> +
> + for (i = 0; i < nr_tables; i++) {
> + unsigned int this_size = min(size, PAGE_SIZE);
> +
> + table[i] = kzalloc(this_size, GFP_KERNEL);
> + if (!table[i]) {
> + io_free_page_table(table, init_size);
> + return NULL;
Unless zalloc returns non-NULL for size == 0, you are guranteed to do
this for size <= PAGE_SIZE * (nr_tables - 1). Possibly worth calculating early?
If you calculate early you could then make the loop:
for (i = 0; i < nr_tables - 1; i++) {
table[i] = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
}
}
table[i] = kzalloc(size - (nr_tables - 1) * PAGE_SIZE, GFP_KERNEL);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
}
Which is almost certainly faster.
> + }
> + size -= this_size;
> + }
> + return table;
> }
>
> static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
> @@ -7190,6 +7212,22 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
> return 0;
> }
>
> +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
> +{
> + size_t size = nr_files * sizeof(struct io_fixed_file);
> +
> + table->files = (struct io_fixed_file **)io_alloc_page_table(size);
> + return !!table->files;
> +}
> +
> +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
> +{
> + size_t size = nr_files * sizeof(struct io_fixed_file);
> +
> + io_free_page_table((void **)table->files, size);
> + table->files = NULL;
> +}
> +
> static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
> {
> #if defined(CONFIG_UNIX)
> @@ -7451,31 +7489,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
> }
> #endif
>
> -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
> -{
> - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
> -
> - table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
> - if (!table->files)
> - return false;
> -
> - for (i = 0; i < nr_tables; i++) {
> - unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
> -
> - table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
> - GFP_KERNEL);
> - if (!table->files[i])
> - break;
> - nr_files -= this_files;
> - }
> -
> - if (i == nr_tables)
> - return true;
> -
> - io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
> - return false;
> -}
> -
> static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
> {
> struct file *file = prsrc->file;
> --
> 2.31.1
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 10/13] io_uring: add helpers for 2 level table alloc
2021-05-27 21:43 ` Noah Goldstein
@ 2021-05-27 22:14 ` Pavel Begunkov
0 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:14 UTC (permalink / raw)
To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING
On 5/27/21 10:43 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
>> -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
>> +static void io_free_page_table(void **table, size_t size)
>> {
>> - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
>> + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>>
>> for (i = 0; i < nr_tables; i++)
>> - kfree(table->files[i]);
>> - kfree(table->files);
>> - table->files = NULL;
>> + kfree(table[i]);
>> + kfree(table);
>> +}
>> +
>> +static void **io_alloc_page_table(size_t size)
>> +{
>> + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>> + size_t init_size = size;
>> + void **table;
>> +
>> + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
>> + if (!table)
>> + return NULL;
>> +
>> + for (i = 0; i < nr_tables; i++) {
>> + unsigned int this_size = min(size, PAGE_SIZE);
>> +
>> + table[i] = kzalloc(this_size, GFP_KERNEL);
>> + if (!table[i]) {
>> + io_free_page_table(table, init_size);
>> + return NULL;
> Unless zalloc returns non-NULL for size == 0, you are guranteed to do
> this for size <= PAGE_SIZE * (nr_tables - 1). Possibly worth calculating early?
Far from being a fast path, so would rather keep it simpler
and cleaner
>
> If you calculate early you could then make the loop:
>
> for (i = 0; i < nr_tables - 1; i++) {
> table[i] = kzalloc(PAGE_SIZE, GFP_KERNEL);
> if (!table[i]) {
> io_free_page_table(table, init_size);
> return NULL;
> }
> }
>
> table[i] = kzalloc(size - (nr_tables - 1) * PAGE_SIZE, GFP_KERNEL);
> if (!table[i]) {
> io_free_page_table(table, init_size);
> return NULL;
> }
>
> Which is almost certainly faster.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 11/13] io_uring: don't vmalloc rsrc tags
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (9 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
2021-05-24 23:51 ` [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
We don't really need vmalloc for keeping tags, it's not a hot path and
is there out of convenience, so replace it with two level tables to not
litter kernel virtual memory mappings.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 36 insertions(+), 16 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1cc2d16637ff..2b2d70a58a87 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -100,6 +100,10 @@
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+#define IO_RSRC_TAG_TABLE_SHIFT 9
+#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
+#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
+
#define IORING_MAX_REG_BUFFERS (1U << 14)
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
@@ -243,7 +247,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
struct io_rsrc_data {
struct io_ring_ctx *ctx;
- u64 *tags;
+ u64 **tags;
+ unsigned int nr;
rsrc_put_fn *do_put;
atomic_t refs;
struct completion done;
@@ -7172,9 +7177,20 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
return ret;
}
+static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+ unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
+ unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+
+ return &data->tags[table_idx][off];
+}
+
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
- kvfree(data->tags);
+ size_t size = data->nr * sizeof(data->tags[0][0]);
+
+ if (data->tags)
+ io_free_page_table((void **)data->tags, size);
kfree(data);
}
@@ -7183,33 +7199,37 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
+ int ret = -ENOMEM;
unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
-
- data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+ data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
if (!data->tags) {
kfree(data);
return -ENOMEM;
}
+
+ data->nr = nr;
+ data->ctx = ctx;
+ data->do_put = do_put;
if (utags) {
+ ret = -EFAULT;
for (i = 0; i < nr; i++) {
- if (copy_from_user(&data->tags[i], &utags[i],
- sizeof(data->tags[i]))) {
- io_rsrc_data_free(data);
- return -EFAULT;
- }
+ if (copy_from_user(io_get_tag_slot(data, i), &utags[i],
+ sizeof(data->tags[i])))
+ goto fail;
}
}
atomic_set(&data->refs, 1);
- data->ctx = ctx;
- data->do_put = do_put;
init_completion(&data->done);
*pdata = data;
return 0;
+fail:
+ io_rsrc_data_free(data);
+ return ret;
}
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
@@ -7678,7 +7698,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
- if (unlikely(ctx->file_data->tags[i]))
+ if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
goto out_fput;
continue;
}
@@ -7776,7 +7796,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
if (!prsrc)
return -ENOMEM;
- prsrc->tag = data->tags[idx];
+ prsrc->tag = *io_get_tag_slot(data, idx);
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
@@ -7846,7 +7866,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- data->tags[up->offset + done] = tag;
+ *io_get_tag_slot(data, up->offset + done) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
@@ -8429,7 +8449,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
ret = io_buffer_validate(&iov);
if (ret)
break;
- if (!iov.iov_base && data->tags[i]) {
+ if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
}
@@ -8502,7 +8522,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
}
ctx->user_bufs[i] = imu;
- ctx->buf_data->tags[offset] = tag;
+ *io_get_tag_slot(ctx->buf_data, offset) = tag;
}
if (needs_switch)
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 12/13] io_uring: cache task struct refs
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (10 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 11/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
2021-05-27 21:51 ` Noah Goldstein
2021-05-24 23:51 ` [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
12 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
tctx in submission part is always synchronised because is executed from
the task's context, so we can batch allocate tctx/task references and
store them across syscall boundaries. It avoids enough of operations,
including an atomic for getting task ref and a percpu_counter_add()
function call, which still fallback to spinlock for large batching
cases (around >=32). Should be good for SQPOLL submitting in small
portions and coming at some moment bpf submissions.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 37 ++++++++++++++++++++++++++++---------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b2d70a58a87..a95d55a0f9be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -110,6 +110,8 @@
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
+#define IO_TCTX_REFS_CACHE_NR (1U << 10)
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
@@ -472,6 +474,7 @@ struct io_ring_ctx {
struct io_uring_task {
/* submission side */
+ int cached_refs;
struct xarray xa;
struct wait_queue_head wait;
const struct io_ring_ctx *last;
@@ -6702,16 +6705,23 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
+ struct io_uring_task *tctx;
int submitted = 0;
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- percpu_counter_add(¤t->io_uring->inflight, nr);
- refcount_add(nr, ¤t->usage);
+ tctx = current->io_uring;
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0)) {
+ unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+ percpu_counter_add(&tctx->inflight, refill);
+ refcount_add(refill, ¤t->usage);
+ tctx->cached_refs += refill;
+ }
io_submit_state_start(&ctx->submit_state, nr);
while (submitted < nr) {
@@ -6737,12 +6747,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- struct io_uring_task *tctx = current->io_uring;
int unused = nr - ref_used;
+ current->io_uring->cached_refs += unused;
percpu_ref_put_many(&ctx->refs, unused);
- percpu_counter_sub(&tctx->inflight, unused);
- put_task_struct_many(current, unused);
}
io_submit_state_end(&ctx->submit_state, ctx);
@@ -7924,7 +7932,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
struct io_uring_task *tctx;
int ret;
- tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
return -ENOMEM;
@@ -7944,13 +7952,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
- tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- tctx->task_state = 0;
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -7961,6 +7967,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(!xa_empty(&tctx->xa));
WARN_ON_ONCE(tctx->io_wq);
+ WARN_ON_ONCE(tctx->cached_refs);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
@@ -9097,6 +9104,16 @@ static void io_uring_try_cancel(bool cancel_all)
}
}
+static void io_uring_drop_tctx_refs(struct task_struct *task)
+{
+ struct io_uring_task *tctx = task->io_uring;
+ unsigned int refs = tctx->cached_refs;
+
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
+}
+
/* should only be called by SQPOLL task */
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
{
@@ -9112,6 +9129,7 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
WARN_ON_ONCE(!sqd || sqd->thread != current);
+ io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
@@ -9149,6 +9167,7 @@ void __io_uring_cancel(struct files_struct *files)
io_wq_exit_start(tctx->io_wq);
/* make sure overflow events are dropped */
+ io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 12/13] io_uring: cache task struct refs
2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
@ 2021-05-27 21:51 ` Noah Goldstein
2021-05-27 22:13 ` Pavel Begunkov
0 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:51 UTC (permalink / raw)
To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING
On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
>
> tctx in submission part is always synchronised because is executed from
> the task's context, so we can batch allocate tctx/task references and
> store them across syscall boundaries. It avoids enough of operations,
> including an atomic for getting task ref and a percpu_counter_add()
> function call, which still fallback to spinlock for large batching
> cases (around >=32). Should be good for SQPOLL submitting in small
> portions and coming at some moment bpf submissions.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
> fs/io_uring.c | 37 ++++++++++++++++++++++++++++---------
> 1 file changed, 28 insertions(+), 9 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 2b2d70a58a87..a95d55a0f9be 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -110,6 +110,8 @@
> IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
> IOSQE_BUFFER_SELECT)
>
> +#define IO_TCTX_REFS_CACHE_NR (1U << 10)
> +
> struct io_uring {
> u32 head ____cacheline_aligned_in_smp;
> u32 tail ____cacheline_aligned_in_smp;
> @@ -472,6 +474,7 @@ struct io_ring_ctx {
>
> struct io_uring_task {
> /* submission side */
> + int cached_refs;
> struct xarray xa;
> struct wait_queue_head wait;
> const struct io_ring_ctx *last;
> @@ -6702,16 +6705,23 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
>
> static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
> {
> + struct io_uring_task *tctx;
> int submitted = 0;
>
> /* make sure SQ entry isn't read before tail */
> nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
> -
> if (!percpu_ref_tryget_many(&ctx->refs, nr))
> return -EAGAIN;
>
> - percpu_counter_add(¤t->io_uring->inflight, nr);
> - refcount_add(nr, ¤t->usage);
> + tctx = current->io_uring;
> + tctx->cached_refs -= nr;
> + if (unlikely(tctx->cached_refs < 0)) {
> + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
Might be cleared to use:
unsigned int refill = IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;
> +
> + percpu_counter_add(&tctx->inflight, refill);
> + refcount_add(refill, ¤t->usage);
> + tctx->cached_refs += refill;
> + }
> io_submit_state_start(&ctx->submit_state, nr);
>
> while (submitted < nr) {
> @@ -6737,12 +6747,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>
> if (unlikely(submitted != nr)) {
> int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
> - struct io_uring_task *tctx = current->io_uring;
> int unused = nr - ref_used;
>
> + current->io_uring->cached_refs += unused;
> percpu_ref_put_many(&ctx->refs, unused);
> - percpu_counter_sub(&tctx->inflight, unused);
> - put_task_struct_many(current, unused);
> }
>
> io_submit_state_end(&ctx->submit_state, ctx);
> @@ -7924,7 +7932,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
> struct io_uring_task *tctx;
> int ret;
>
> - tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
> + tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
> if (unlikely(!tctx))
> return -ENOMEM;
>
> @@ -7944,13 +7952,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
>
> xa_init(&tctx->xa);
> init_waitqueue_head(&tctx->wait);
> - tctx->last = NULL;
> atomic_set(&tctx->in_idle, 0);
> atomic_set(&tctx->inflight_tracked, 0);
> task->io_uring = tctx;
> spin_lock_init(&tctx->task_lock);
> INIT_WQ_LIST(&tctx->task_list);
> - tctx->task_state = 0;
> init_task_work(&tctx->task_work, tctx_task_work);
> return 0;
> }
> @@ -7961,6 +7967,7 @@ void __io_uring_free(struct task_struct *tsk)
>
> WARN_ON_ONCE(!xa_empty(&tctx->xa));
> WARN_ON_ONCE(tctx->io_wq);
> + WARN_ON_ONCE(tctx->cached_refs);
>
> percpu_counter_destroy(&tctx->inflight);
> kfree(tctx);
> @@ -9097,6 +9104,16 @@ static void io_uring_try_cancel(bool cancel_all)
> }
> }
>
> +static void io_uring_drop_tctx_refs(struct task_struct *task)
> +{
> + struct io_uring_task *tctx = task->io_uring;
> + unsigned int refs = tctx->cached_refs;
> +
> + tctx->cached_refs = 0;
> + percpu_counter_sub(&tctx->inflight, refs);
> + put_task_struct_many(task, refs);
> +}
> +
> /* should only be called by SQPOLL task */
> static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
> {
> @@ -9112,6 +9129,7 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
>
> WARN_ON_ONCE(!sqd || sqd->thread != current);
>
> + io_uring_drop_tctx_refs(current);
> atomic_inc(&tctx->in_idle);
> do {
> /* read completions before cancelations */
> @@ -9149,6 +9167,7 @@ void __io_uring_cancel(struct files_struct *files)
> io_wq_exit_start(tctx->io_wq);
>
> /* make sure overflow events are dropped */
> + io_uring_drop_tctx_refs(current);
> atomic_inc(&tctx->in_idle);
> do {
> /* read completions before cancelations */
> --
> 2.31.1
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 12/13] io_uring: cache task struct refs
2021-05-27 21:51 ` Noah Goldstein
@ 2021-05-27 22:13 ` Pavel Begunkov
0 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:13 UTC (permalink / raw)
To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING
On 5/27/21 10:51 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <[email protected]> wrote:
[...]
>>
>> - percpu_counter_add(¤t->io_uring->inflight, nr);
>> - refcount_add(nr, ¤t->usage);
>> + tctx = current->io_uring;
>> + tctx->cached_refs -= nr;
>> + if (unlikely(tctx->cached_refs < 0)) {
>> + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
>
> Might be cleared to use:
>
> unsigned int refill = IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;
=======================================================================
Did think about it, but left as is to emphasize that it's negative and
we convert it to positive, so all operations are with positive (and
unsigned due to C rules). Code generation will be same.
>> +
>> + percpu_counter_add(&tctx->inflight, refill);
>> + refcount_add(refill, ¤t->usage);
>> + tctx->cached_refs += refill;
>> + }
>> io_submit_state_start(&ctx->submit_state, nr);
>>
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
` (11 preceding siblings ...)
2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
12 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
To: Jens Axboe, io-uring
Merge io_uring_cancel_sqpoll() and __io_uring_cancel() as it's easier to
have a conditional ctx traverse inside than keeping them in sync.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 89 +++++++++++++++++----------------------------------
1 file changed, 30 insertions(+), 59 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a95d55a0f9be..7db6aaf31080 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1036,7 +1036,7 @@ static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
@@ -6921,7 +6921,7 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
- io_uring_cancel_sqpoll(sqd);
+ io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
@@ -9089,21 +9089,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_try_cancel(bool cancel_all)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- struct io_ring_ctx *ctx = node->ctx;
-
- /* sqpoll task will cancel all its requests */
- if (!ctx->sq_data)
- io_uring_try_cancel_requests(ctx, current, cancel_all);
- }
-}
-
static void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
@@ -9114,69 +9099,50 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
put_task_struct_many(task, refs);
}
-/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
+/*
+ * Find any io_uring ctx that this task has registered or done IO on, and cancel
+ * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
+ */
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
s64 inflight;
DEFINE_WAIT(wait);
+ WARN_ON_ONCE(sqd && sqd->thread != current);
+
if (!current->io_uring)
return;
if (tctx->io_wq)
io_wq_exit_start(tctx->io_wq);
- WARN_ON_ONCE(!sqd || sqd->thread != current);
-
io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx, false);
+ inflight = tctx_inflight(tctx, !cancel_all);
if (!inflight)
break;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_try_cancel_requests(ctx, current, true);
- prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
- /*
- * If we've seen completions, retry without waiting. This
- * avoids a race where a completion comes in before we did
- * prepare_to_wait().
- */
- if (inflight == tctx_inflight(tctx, false))
- schedule();
- finish_wait(&tctx->wait, &wait);
- } while (1);
- atomic_dec(&tctx->in_idle);
-}
+ if (!sqd) {
+ struct io_tctx_node *node;
+ unsigned long index;
-/*
- * Find any io_uring fd that this task has registered or done IO on, and cancel
- * requests.
- */
-void __io_uring_cancel(struct files_struct *files)
-{
- struct io_uring_task *tctx = current->io_uring;
- DEFINE_WAIT(wait);
- s64 inflight;
- bool cancel_all = !files;
-
- if (tctx->io_wq)
- io_wq_exit_start(tctx->io_wq);
+ xa_for_each(&tctx->xa, index, node) {
+ /* sqpoll task will cancel all its requests */
+ if (node->ctx->sq_data)
+ continue;
+ io_uring_try_cancel_requests(node->ctx, current,
+ cancel_all);
+ }
+ } else {
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_try_cancel_requests(ctx, current,
+ cancel_all);
+ }
- /* make sure overflow events are dropped */
- io_uring_drop_tctx_refs(current);
- atomic_inc(&tctx->in_idle);
- do {
- /* read completions before cancelations */
- inflight = tctx_inflight(tctx, !cancel_all);
- if (!inflight)
- break;
- io_uring_try_cancel(cancel_all);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
@@ -9195,6 +9161,11 @@ void __io_uring_cancel(struct files_struct *files)
}
}
+void __io_uring_cancel(struct files_struct *files)
+{
+ io_uring_cancel_generic(!files, NULL);
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
--
2.31.1
^ permalink raw reply related [flat|nested] 20+ messages in thread