* [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
@ 2020-05-28 9:15 Xiaoguang Wang
2020-05-28 9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-28 9:15 UTC (permalink / raw)
To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang
If requests can be submitted and completed inline, we don't need to
initialize the whole io_wq_work in io_init_req(), which is an expensive
operation. Add a new 'REQ_F_WORK_INITIALIZED' flag to control whether
io_wq_work has been initialized.
I use /dev/nullb0 to evaluate the performance improvement on my physical
machine:
modprobe null_blk nr_devices=1 completion_nsec=0
sudo taskset -c 60 fio -name=fiotest -filename=/dev/nullb0 -iodepth=128
-thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
-time_based -runtime=120
before this patch:
Run status group 0 (all jobs):
READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
io=84.8GiB (91.1GB), run=120001-120001msec
With this patch:
Run status group 0 (all jobs):
READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
io=89.2GiB (95.8GB), run=120001-120001msec
About 5% improvement.
Signed-off-by: Xiaoguang Wang <[email protected]>
---
fs/io-wq.h | 5 ----
fs/io_uring.c | 78 ++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 62 insertions(+), 21 deletions(-)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5ba12de7572f..3d85d365d764 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -94,11 +94,6 @@ struct io_wq_work {
pid_t task_pid;
};
-#define INIT_IO_WORK(work, _func) \
- do { \
- *(work) = (struct io_wq_work){ .func = _func }; \
- } while (0) \
-
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
if (!work->list.next)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2af87f73848e..7ba8590a45a6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -535,6 +535,7 @@ enum {
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_WORK_INITIALIZED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -590,6 +591,8 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
+ /* io_wq_work is initialized */
+ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
};
struct async_poll {
@@ -635,6 +638,7 @@ struct io_kiocb {
unsigned int flags;
refcount_t refs;
struct task_struct *task;
+ const struct cred *creds;
unsigned long fsize;
u64 user_data;
u32 result;
@@ -882,6 +886,12 @@ static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
+static inline void init_io_work(struct io_kiocb *req,
+ void (*func)(struct io_wq_work **))
+{
+ req->work = (struct io_wq_work){ .func = func };
+ req->flags |= REQ_F_WORK_INITIALIZED;
+}
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
@@ -1035,8 +1045,15 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
mmgrab(current->mm);
req->work.mm = current->mm;
}
- if (!req->work.creds)
- req->work.creds = get_current_cred();
+
+ if (!req->work.creds) {
+ if (!req->creds)
+ req->work.creds = get_current_cred();
+ else {
+ req->work.creds = req->creds;
+ req->creds = NULL;
+ }
+ }
if (!req->work.fs && def->needs_fs) {
spin_lock(&current->fs->lock);
if (!current->fs->in_exec) {
@@ -1053,6 +1070,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ return;
+
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
@@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
/* fsync always requires a blocking context */
if (force_nonblock) {
- req->work.func = io_fsync_finish;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_fsync_finish);
+ else
+ req->work.func = io_fsync_finish;
return -EAGAIN;
}
__io_fsync(req);
@@ -2971,7 +2994,10 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
/* fallocate always requiring blocking context */
if (force_nonblock) {
- req->work.func = io_fallocate_finish;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_fallocate_finish);
+ else
+ req->work.func = io_fallocate_finish;
return -EAGAIN;
}
@@ -3500,7 +3526,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
/* submission ref will be dropped, take it for async */
refcount_inc(&req->refs);
- req->work.func = io_close_finish;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_close_finish);
+ else
+ req->work.func = io_close_finish;
/*
* Do manual async queue here to avoid grabbing files - we don't
* need the files, and it'll cause io_close_finish() to close
@@ -3563,7 +3592,10 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
- req->work.func = io_sync_file_range_finish;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_sync_file_range_finish);
+ else
+ req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
@@ -4032,7 +4064,10 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
ret = __io_accept(req, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
- req->work.func = io_accept_finish;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_accept_finish);
+ else
+ req->work.func = io_accept_finish;
return -EAGAIN;
}
return 0;
@@ -5032,6 +5067,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (!sqe)
return 0;
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_wq_submit_work);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
@@ -5667,19 +5705,24 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt;
- const struct cred *old_creds = NULL;
+ const struct cred *creds, *old_creds = NULL;
int ret;
again:
linked_timeout = io_prep_linked_timeout(req);
- if (req->work.creds && req->work.creds != current_cred()) {
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ creds = req->work.creds;
+ else
+ creds = req->creds;
+
+ if (creds && creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
- if (old_creds == req->work.creds)
+ if (old_creds == creds)
old_creds = NULL; /* restored original creds */
else
- old_creds = override_creds(req->work.creds);
+ old_creds = override_creds(creds);
}
ret = io_issue_sqe(req, sqe, true);
@@ -5696,6 +5739,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
goto exit;
}
punt:
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ init_io_work(req, io_wq_submit_work);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
@@ -5948,7 +5994,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
@@ -5970,11 +6015,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
id = READ_ONCE(sqe->personality);
if (id) {
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds))
+ req->creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->creds))
return -EINVAL;
- get_cred(req->work.creds);
- }
+ get_cred(req->creds);
+ } else
+ req->creds = NULL;
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
--
2.17.2
* [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
2020-05-28 9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
@ 2020-05-28 9:15 ` Xiaoguang Wang
2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
2020-05-29 8:58 ` Pavel Begunkov
2 siblings, 0 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-28 9:15 UTC (permalink / raw)
To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang
Basically, the IORING_OP_POLL_ADD command and async armed poll handlers
for regular commands don't touch io_wq_work, so we only need to do the
io_wq_work copy and restore when REQ_F_WORK_INITIALIZED is set.
Signed-off-by: Xiaoguang Wang <[email protected]>
---
V3:
drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
and restore.
---
fs/io_uring.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7ba8590a45a6..cade241568c6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4411,7 +4411,8 @@ static void io_async_task_func(struct callback_head *cb)
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
if (!canceled) {
@@ -4508,7 +4509,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
return false;
req->flags |= REQ_F_POLLED;
- memcpy(&apoll->work, &req->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
had_io = req->io != NULL;
get_task_struct(current);
@@ -4533,7 +4535,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
if (!had_io)
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
return false;
}
@@ -4578,7 +4581,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
* io_req_work_drop_env below when dropping the
* final reference.
*/
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work,
+ sizeof(req->work));
kfree(apoll);
}
}
--
2.17.2
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-28 9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
2020-05-28 9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
@ 2020-05-28 17:03 ` Jens Axboe
2020-05-29 3:58 ` Xiaoguang Wang
2020-05-29 8:58 ` Pavel Begunkov
2 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2020-05-28 17:03 UTC (permalink / raw)
To: Xiaoguang Wang, io-uring; +Cc: asml.silence, joseph.qi
On 5/28/20 3:15 AM, Xiaoguang Wang wrote:
> If requests can be submitted and completed inline, we don't need to
> initialize whole io_wq_work in io_init_req(), which is an expensive
> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
> io_wq_work is initialized.
>
> I use /dev/nullb0 to evaluate performance improvement in my physical
> machine:
> modprobe null_blk nr_devices=1 completion_nsec=0
> sudo taskset -c 60 fio -name=fiotest -filename=/dev/nullb0 -iodepth=128
> -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
> -time_based -runtime=120
>
> before this patch:
> Run status group 0 (all jobs):
> READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
> io=84.8GiB (91.1GB), run=120001-120001msec
>
> With this patch:
> Run status group 0 (all jobs):
> READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
> io=89.2GiB (95.8GB), run=120001-120001msec
>
> About 5% improvement.
I think this is a big enough win to warrant looking closer
at this. Just a quick comment from me so far:
> @@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
> {
> /* fsync always requires a blocking context */
> if (force_nonblock) {
> - req->work.func = io_fsync_finish;
> + if (!(req->flags & REQ_F_WORK_INITIALIZED))
> + init_io_work(req, io_fsync_finish);
> + else
> + req->work.func = io_fsync_finish;
This pattern is repeated enough to warrant a helper, ala:
static void io_req_init_async(req, func)
{
if (req->flags & REQ_F_WORK_INITIALIZED)
req->work.func = func;
else
init_io_work(req, func);
}
I also swapped the conditions; I tend to find it easier to read without
the negation.
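Fully typed, and dropped into e.g. the io_fsync() hunk, it'd look something
like this (untested, just to show the call sites; it reuses the init_io_work()
helper from your patch):

static inline void io_req_init_async(struct io_kiocb *req,
				     void (*func)(struct io_wq_work **))
{
	if (req->flags & REQ_F_WORK_INITIALIZED)
		req->work.func = func;
	else
		init_io_work(req, func);
}

	/* fsync always requires a blocking context */
	if (force_nonblock) {
		io_req_init_async(req, io_fsync_finish);
		return -EAGAIN;
	}

The same one-liner would then work for the fallocate, close, sync_file_range,
and accept spots as well.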
--
Jens Axboe
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
@ 2020-05-29 3:58 ` Xiaoguang Wang
0 siblings, 0 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-29 3:58 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: asml.silence, joseph.qi
hi,
> On 5/28/20 3:15 AM, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize whole io_wq_work in io_init_req(), which is an expensive
>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>> io_wq_work is initialized.
>>
>> I use /dev/nullb0 to evaluate performance improvement in my physical
>> machine:
>> modprobe null_blk nr_devices=1 completion_nsec=0
>> sudo taskset -c 60 fio -name=fiotest -filename=/dev/nullb0 -iodepth=128
>> -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>> -time_based -runtime=120
>>
>> before this patch:
>> Run status group 0 (all jobs):
>> READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>> io=84.8GiB (91.1GB), run=120001-120001msec
>>
>> With this patch:
>> Run status group 0 (all jobs):
>> READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>> io=89.2GiB (95.8GB), run=120001-120001msec
>>
>> About 5% improvement.
>
> I think this is a big enough of a win to warrant looking closer
> at this. Just a quick comment from me so far:
Yeah, to be honest, I didn't expect such an obvious improvement.
But I have run multiple rounds of the same test and always get a similar
improvement; if you have some free time, you can give it a try :)
>
>> @@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
>> {
>> /* fsync always requires a blocking context */
>> if (force_nonblock) {
>> - req->work.func = io_fsync_finish;
>> + if (!(req->flags & REQ_F_WORK_INITIALIZED))
>> + init_io_work(req, io_fsync_finish);
>> + else
>> + req->work.func = io_fsync_finish;
>
> This pattern is repeated enough to warrant a helper, ala:
>
> static void io_req_init_async(req, func)
> {
> if (req->flags & REQ_F_WORK_INITIALIZED)
> req->work.func = func;
> else
> init_io_work(req, func);
> }
>
> also swapped the conditions, I tend to find it easier to read without
> the negation.
Thanks for your suggestions. I'll prepare a V4 soon.
Regards,
Xiaoguang Wang
>
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-28 9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
2020-05-28 9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
@ 2020-05-29 8:58 ` Pavel Begunkov
2020-05-29 14:27 ` Jens Axboe
2020-05-30 13:36 ` Xiaoguang Wang
2 siblings, 2 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-05-29 8:58 UTC (permalink / raw)
To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi
On 28/05/2020 12:15, Xiaoguang Wang wrote:
> If requests can be submitted and completed inline, we don't need to
> initialize whole io_wq_work in io_init_req(), which is an expensive
> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
> io_wq_work is initialized.
It looks nicer. Especially if you'd add a helper as Jens suggested.
The other thing, even though I hate treating a part of the fields differently
from others, I don't like the ->creds tossing either.
Did you consider trying to use only ->work.creds without adding req->creds,
like in the untested incremental below? init_io_work() there is misleading;
it should be reworked somehow.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4dd3295d74f6..4086561ce444 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -643,7 +643,6 @@ struct io_kiocb {
unsigned int flags;
refcount_t refs;
struct task_struct *task;
- const struct cred *creds;
unsigned long fsize;
u64 user_data;
u32 result;
@@ -894,8 +893,16 @@ static const struct file_operations io_uring_fops;
static inline void init_io_work(struct io_kiocb *req,
void (*func)(struct io_wq_work **))
{
- req->work = (struct io_wq_work){ .func = func };
- req->flags |= REQ_F_WORK_INITIALIZED;
+ struct io_wq_work *work = &req->work;
+
+ /* work->creds are already initialised by a user */
+ work->list.next = NULL;
+ work->func = func;
+ work->files = NULL;
+ work->mm = NULL;
+ work->fs = NULL;
+ work->flags = REQ_F_WORK_INITIALIZED;
+ work->task_pid = 0;
}
struct sock *io_uring_get_socket(struct file *file)
{
@@ -1019,15 +1026,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
mmgrab(current->mm);
req->work.mm = current->mm;
}
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
- if (!req->work.creds) {
- if (!req->creds)
- req->work.creds = get_current_cred();
- else {
- req->work.creds = req->creds;
- req->creds = NULL;
- }
- }
if (!req->work.fs && def->needs_fs) {
spin_lock(&current->fs->lock);
if (!current->fs->in_exec) {
@@ -1044,6 +1045,12 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
+ /* always init'ed, put before REQ_F_WORK_INITIALIZED check */
+ if (req->work.creds) {
+ put_cred(req->work.creds);
+ req->work.creds = NULL;
+ }
+
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
@@ -1051,10 +1058,6 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
mmdrop(req->work.mm);
req->work.mm = NULL;
}
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
- }
if (req->work.fs) {
struct fs_struct *fs = req->work.fs;
@@ -5901,12 +5904,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct
io_kiocb *req,
id = READ_ONCE(sqe->personality);
if (id) {
- req->creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->creds))
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
return -EINVAL;
- get_cred(req->creds);
+ get_cred(req->work.creds);
} else
- req->creds = NULL;
+ req->work.creds = NULL;
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
--
Pavel Begunkov
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-29 8:58 ` Pavel Begunkov
@ 2020-05-29 14:27 ` Jens Axboe
2020-05-30 13:36 ` Xiaoguang Wang
1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2020-05-29 14:27 UTC (permalink / raw)
To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi
On 5/29/20 2:58 AM, Pavel Begunkov wrote:
> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize whole io_wq_work in io_init_req(), which is an expensive
>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>> io_wq_work is initialized.
>
> It looks nicer. Especially if you'd add a helper as Jens supposed.
>
> The other thing, even though I hate treating a part of the fields differently
> from others, I don't like ->creds tossing either.
>
> Did you consider trying using only ->work.creds without adding req->creds? like
> in the untested incremental below. init_io_work() there is misleading, should be
> somehow played around better.
I had that thought too when reading the patchset, would be nice _not_ to have
to add a new creds field.
--
Jens Axboe
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-29 8:58 ` Pavel Begunkov
2020-05-29 14:27 ` Jens Axboe
@ 2020-05-30 13:36 ` Xiaoguang Wang
2020-05-30 13:43 ` Jens Axboe
2020-05-30 19:38 ` Pavel Begunkov
1 sibling, 2 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-30 13:36 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi
hi,
> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize whole io_wq_work in io_init_req(), which is an expensive
>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>> io_wq_work is initialized.
>
> It looks nicer. Especially if you'd add a helper as Jens supposed.
Sure, I'll add a helper in V4, thanks.
>
> The other thing, even though I hate treating a part of the fields differently
> from others, I don't like ->creds tossing either.
>
> Did you consider trying using only ->work.creds without adding req->creds? like
> in the untested incremental below. init_io_work() there is misleading, should be
> somehow played around better.
But if we don't add a new req->creds, I think there will be some potential risks.
Look at io_kiocb's memory layout in the current io_uring mainline code:
crash> struct -o io_kiocb
struct io_kiocb {
union {
[0] struct file *file;
[0] struct io_rw rw;
[0] struct io_poll_iocb poll;
[0] struct io_accept accept;
[0] struct io_sync sync;
[0] struct io_cancel cancel;
[0] struct io_timeout timeout;
[0] struct io_connect connect;
[0] struct io_sr_msg sr_msg;
[0] struct io_open open;
[0] struct io_close close;
[0] struct io_files_update files_update;
[0] struct io_fadvise fadvise;
[0] struct io_madvise madvise;
[0] struct io_epoll epoll;
[0] struct io_splice splice;
[0] struct io_provide_buf pbuf;
};
[64] struct io_async_ctx *io;
[72] int cflags;
[76] u8 opcode;
[78] u16 buf_index;
[80] struct io_ring_ctx *ctx;
[88] struct list_head list;
[104] unsigned int flags;
[108] refcount_t refs;
[112] struct task_struct *task;
[120] unsigned long fsize;
[128] u64 user_data;
[136] u32 result;
[140] u32 sequence;
[144] struct list_head link_list;
[160] struct list_head inflight_entry;
[176] struct percpu_ref *fixed_file_refs;
union {
struct {
[184] struct callback_head task_work;
[200] struct hlist_node hash_node;
[216] struct async_poll *apoll;
};
[184] struct io_wq_work work;
};
}
SIZE: 240
struct io_wq_work {
[0] struct io_wq_work_node list;
[8] void (*func)(struct io_wq_work **);
[16] struct files_struct *files;
[24] struct mm_struct *mm;
[32] const struct cred *creds;
[40] struct fs_struct *fs;
[48] unsigned int flags;
[52] pid_t task_pid;
}
SIZE: 56
The risk mainly comes from the union:
union {
/*
* Only commands that never go async can use the below fields,
* obviously. Right now only IORING_OP_POLL_ADD uses them, and
* async armed poll handlers for regular commands. The latter
* restore the work, if needed.
*/
struct {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
};
struct io_wq_work work;
};
1. apoll and creds are at the same memory offset; for the 'async armed poll
handlers' case, apoll will be used, which means creds will be overwritten
(see the build-time offset check sketched after this list). In patch "io_uring:
avoid unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
to control whether to do the io_wq_work restore, so your code below will break:
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
if (req->work.creds) {
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here req->work.creds will be invalid, or I would still need extra space
to record the original req->work.creds and restore it.
put_cred(req->work.creds);
req->work.creds = NULL;
}
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
2. For the IORING_OP_POLL_ADD case, the current mainline code uses task_work and
hash_node, 32 bytes, which means io_wq_work's members list, func, files and mm
would be overwritten, but creds is not touched, so it's safe for now. But if we
later add a new member to
struct {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
};
say callback_head gains a new member, this check will break again.
3. IMO, io_wq_work exists just to describe the running environment needed by reqs
that will be punted to io-wq; from a software design view, reqs that are submitted
and completed inline should not touch this struct at all. Also, the current io_kiocb
is 240 bytes, and with a new pointer it will be 248 bytes, which still fits in 4
cache lines with 64-byte cache lines.
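If it helps, both the aliasing in point 1 and the size in point 3 could be
checked at build time in fs/io_uring.c, e.g. somewhere like io_uring_init()
(untested sketch, offsets taken from the crash output above):

	/* apoll and work.creds really do share an offset: 216 == 184 + 32 */
	BUILD_BUG_ON(offsetof(struct io_kiocb, apoll) !=
		     offsetof(struct io_kiocb, work) +
		     offsetof(struct io_wq_work, creds));

	/* even with an extra req->creds pointer we stay within 4 cache lines */
	BUILD_BUG_ON(sizeof(struct io_kiocb) > 4 * L1_CACHE_BYTES);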
Regards,
Xiaoguang Wang
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 4dd3295d74f6..4086561ce444 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -643,7 +643,6 @@ struct io_kiocb {
> unsigned int flags;
> refcount_t refs;
> struct task_struct *task;
> - const struct cred *creds;
> unsigned long fsize;
> u64 user_data;
> u32 result;
> @@ -894,8 +893,16 @@ static const struct file_operations io_uring_fops;
> static inline void init_io_work(struct io_kiocb *req,
> void (*func)(struct io_wq_work **))
> {
> - req->work = (struct io_wq_work){ .func = func };
> - req->flags |= REQ_F_WORK_INITIALIZED;
> + struct io_wq_work *work = &req->work;
> +
> + /* work->creds are already initialised by a user */
> + work->list.next = NULL;
> + work->func = func;
> + work->files = NULL;
> + work->mm = NULL;
> + work->fs = NULL;
> + work->flags = REQ_F_WORK_INITIALIZED;
> + work->task_pid = 0;
> }
> struct sock *io_uring_get_socket(struct file *file)
> {
> @@ -1019,15 +1026,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
> mmgrab(current->mm);
> req->work.mm = current->mm;
> }
> + if (!req->work.creds)
> + req->work.creds = get_current_cred();
>
> - if (!req->work.creds) {
> - if (!req->creds)
> - req->work.creds = get_current_cred();
> - else {
> - req->work.creds = req->creds;
> - req->creds = NULL;
> - }
> - }
> if (!req->work.fs && def->needs_fs) {
> spin_lock(&current->fs->lock);
> if (!current->fs->in_exec) {
> @@ -1044,6 +1045,12 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
>
> static inline void io_req_work_drop_env(struct io_kiocb *req)
> {
> + /* always init'ed, put before REQ_F_WORK_INITIALIZED check */
> + if (req->work.creds) {
> + put_cred(req->work.creds);
> + req->work.creds = NULL;
> + }
> +
> if (!(req->flags & REQ_F_WORK_INITIALIZED))
> return;
>
> @@ -1051,10 +1058,6 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
> mmdrop(req->work.mm);
> req->work.mm = NULL;
> }
> - if (req->work.creds) {
> - put_cred(req->work.creds);
> - req->work.creds = NULL;
> - }
> if (req->work.fs) {
> struct fs_struct *fs = req->work.fs;
>
> @@ -5901,12 +5904,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct
> io_kiocb *req,
>
> id = READ_ONCE(sqe->personality);
> if (id) {
> - req->creds = idr_find(&ctx->personality_idr, id);
> - if (unlikely(!req->creds))
> + req->work.creds = idr_find(&ctx->personality_idr, id);
> + if (unlikely(!req->work.creds))
> return -EINVAL;
> - get_cred(req->creds);
> + get_cred(req->work.creds);
> } else
> - req->creds = NULL;
> + req->work.creds = NULL;
>
> /* same numerical values with corresponding REQ_F_*, safe to copy */
> req->flags |= sqe_flags;
>
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-30 13:36 ` Xiaoguang Wang
@ 2020-05-30 13:43 ` Jens Axboe
2020-05-30 19:38 ` Pavel Begunkov
1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2020-05-30 13:43 UTC (permalink / raw)
To: Xiaoguang Wang, Pavel Begunkov, io-uring; +Cc: joseph.qi
On 5/30/20 7:36 AM, Xiaoguang Wang wrote:
> hi,
>
>> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>>> If requests can be submitted and completed inline, we don't need to
>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>> io_wq_work is initialized.
>>
>> It looks nicer. Especially if you'd add a helper as Jens supposed.
> Sure, I'll add a helper in V4, thanks.
>
>>
>> The other thing, even though I hate treating a part of the fields differently
>> from others, I don't like ->creds tossing either.
>>
>> Did you consider trying using only ->work.creds without adding req->creds? like
>> in the untested incremental below. init_io_work() there is misleading, should be
>> somehow played around better.
> But if not adding a new req->creds, I think there will be some potential risks.
> In current io_uring mainline codes, look at io_kiocb's memory layout
> crash> struct -o io_kiocb
> struct io_kiocb {
> union {
> [0] struct file *file;
> [0] struct io_rw rw;
> [0] struct io_poll_iocb poll;
> [0] struct io_accept accept;
> [0] struct io_sync sync;
> [0] struct io_cancel cancel;
> [0] struct io_timeout timeout;
> [0] struct io_connect connect;
> [0] struct io_sr_msg sr_msg;
> [0] struct io_open open;
> [0] struct io_close close;
> [0] struct io_files_update files_update;
> [0] struct io_fadvise fadvise;
> [0] struct io_madvise madvise;
> [0] struct io_epoll epoll;
> [0] struct io_splice splice;
> [0] struct io_provide_buf pbuf;
> };
> [64] struct io_async_ctx *io;
> [72] int cflags;
> [76] u8 opcode;
> [78] u16 buf_index;
> [80] struct io_ring_ctx *ctx;
> [88] struct list_head list;
> [104] unsigned int flags;
> [108] refcount_t refs;
> [112] struct task_struct *task;
> [120] unsigned long fsize;
> [128] u64 user_data;
> [136] u32 result;
> [140] u32 sequence;
> [144] struct list_head link_list;
> [160] struct list_head inflight_entry;
> [176] struct percpu_ref *fixed_file_refs;
> union {
> struct {
> [184] struct callback_head task_work;
> [200] struct hlist_node hash_node;
> [216] struct async_poll *apoll;
> };
> [184] struct io_wq_work work;
> };
> }
> SIZE: 240
>
> struct io_wq_work {
> [0] struct io_wq_work_node list;
> [8] void (*func)(struct io_wq_work **);
> [16] struct files_struct *files;
> [24] struct mm_struct *mm;
> [32] const struct cred *creds;
> [40] struct fs_struct *fs;
> [48] unsigned int flags;
> [52] pid_t task_pid;
> }
> SIZE: 56
>
> The risk mainly comes from the union:
> union {
> /*
> * Only commands that never go async can use the below fields,
> * obviously. Right now only IORING_OP_POLL_ADD uses them, and
> * async armed poll handlers for regular commands. The latter
> * restore the work, if needed.
> */
> struct {
> struct callback_head task_work;
> struct hlist_node hash_node;
> struct async_poll *apoll;
> };
> struct io_wq_work work;
> };
>
> 1, apoll and creds are in same memory offset, for 'async armed poll handlers' case,
> apoll will be used, that means creds will be overwrited. In patch "io_uring: avoid
> unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
> to control whether to do io_wq_work restore, then your below codes will break:
>
> static inline void io_req_work_drop_env(struct io_kiocb *req)
> {
> /* always init'ed, put before REQ_F_WORK_INITIALIZED check */
> if (req->work.creds) {
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Here req->work.creds will be invalid, or I still need to use some space
> to record original req->work.creds, and do creds restore.
>
> put_cred(req->work.creds);
> req->work.creds = NULL;
> }
> if (!(req->flags & REQ_F_WORK_INITIALIZED))
> return;
>
> 2, For IORING_OP_POLL_ADD case, current mainline codes will use task_work and hash_node,
> 32 bytes, that means io_wq_work's member list, func, files and mm would be overwrited,
> but will not touch creds, it's safe now. But if we will add some new member to
> struct {
> struct callback_head task_work;
> struct hlist_node hash_node;
> struct async_poll *apoll;
> };
> say callback_head adds a new member, our check will still break.
>
> 3. IMO, io_wq_work is just to describe needed running environment for reqs that will be
> punted to io-wq, for reqs submitted and completed inline should not touch this struct
> from software design view, and current io_kiocb is 240 bytes, and a new pointer will be
> 248 bytes, still 4 cache lines for cache line 64 bytes.
I'd say let's do the extra creds member for now. It's safer. We can always look into
shrinking down the size and re-using the io_wq_work part. I'd rather get this
prepped up and ready for 5.8.
Are you sending out a v4 today?
--
Jens Axboe
* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
2020-05-30 13:36 ` Xiaoguang Wang
2020-05-30 13:43 ` Jens Axboe
@ 2020-05-30 19:38 ` Pavel Begunkov
1 sibling, 0 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-05-30 19:38 UTC (permalink / raw)
To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi
On 30/05/2020 16:36, Xiaoguang Wang wrote:
>
> The risk mainly comes from the union:
> union {
> /*
> * Only commands that never go async can use the below fields,
> * obviously. Right now only IORING_OP_POLL_ADD uses them, and
> * async armed poll handlers for regular commands. The latter
> * restore the work, if needed.
> */
> struct {
> struct callback_head task_work;
> struct hlist_node hash_node;
> struct async_poll *apoll;
> };
> struct io_wq_work work;
> };
>
> 1, apoll and creds are in same memory offset, for 'async armed poll handlers' case,
> apoll will be used, that means creds will be overwrited. In patch "io_uring: avoid
> unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
> to control whether to do io_wq_work restore, then your below codes will break:
Yes, that's an example, which doesn't even consider the second patch. But great
that you anticipated the error. Unconditional partial copy/init probably would
solve the issue, but let's keep it aside for now.
>
> static inline void io_req_work_drop_env(struct io_kiocb *req)
> {
> /* always init'ed, put before REQ_F_WORK_INITIALIZED check */
> if (req->work.creds) {
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Here req->work.creds will be invalid, or I still need to use some space
> to record original req->work.creds, and do creds restore.
>
> put_cred(req->work.creds);
> req->work.creds = NULL;
> }
> if (!(req->flags & REQ_F_WORK_INITIALIZED))
> return;
>
> 2, For IORING_OP_POLL_ADD case, current mainline codes will use task_work and
> hash_node,
> 32 bytes, that means io_wq_work's member list, func, files and mm would be
> overwrited,
> but will not touch creds, it's safe now. But if we will add some new member to
> struct {
> struct callback_head task_work;
> struct hlist_node hash_node;
> struct async_poll *apoll;
> };
> say callback_head adds a new member, our check will still break.
>
> 3. IMO, io_wq_work is just to describe needed running environment for reqs that
> will be
> punted to io-wq, for reqs submitted and completed inline should not touch this
> struct
> from software design view, and current io_kiocb is 240 bytes, and a new pointer
Instead, it stores an entity in 2 different places, adding yet another
thing/state to keep in mind. Arguable. I'd rather say -- neither one is better.
And that's why I like the simplicity of initialising it always.
> will be
> 248 bytes, still 4 cache lines for cache line 64 bytes.
>
>
--
Pavel Begunkov