public inbox for [email protected]
* [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
@ 2020-05-28  9:15 Xiaoguang Wang
  2020-05-28  9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-28  9:15 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

If requests can be submitted and completed inline, we don't need to
initialize the whole io_wq_work in io_init_req(), which is an expensive
operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
io_wq_work has been initialized.

I use /dev/nullb0 to evaluate performance improvement in my physical
machine:
  modprobe null_blk nr_devices=1 completion_nsec=0
  sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
  -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
  -time_based -runtime=120

before this patch:
Run status group 0 (all jobs):
   READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
   io=84.8GiB (91.1GB), run=120001-120001msec

With this patch:
Run status group 0 (all jobs):
   READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
   io=89.2GiB (95.8GB), run=120001-120001msec

About 5% improvement.

Signed-off-by: Xiaoguang Wang <[email protected]>
---
 fs/io-wq.h    |  5 ----
 fs/io_uring.c | 78 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5ba12de7572f..3d85d365d764 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -94,11 +94,6 @@ struct io_wq_work {
 	pid_t task_pid;
 };
 
-#define INIT_IO_WORK(work, _func)				\
-	do {							\
-		*(work) = (struct io_wq_work){ .func = _func };	\
-	} while (0)						\
-
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 {
 	if (!work->list.next)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2af87f73848e..7ba8590a45a6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -535,6 +535,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_WORK_INITIALIZED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -590,6 +591,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* io_wq_work is initialized */
+	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 };
 
 struct async_poll {
@@ -635,6 +638,7 @@ struct io_kiocb {
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
+	const struct cred	*creds;
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
@@ -882,6 +886,12 @@ static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
 
+static inline void init_io_work(struct io_kiocb *req,
+			void (*func)(struct io_wq_work **))
+{
+	req->work = (struct io_wq_work){ .func = func };
+	req->flags |= REQ_F_WORK_INITIALIZED;
+}
 struct sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
@@ -1035,8 +1045,15 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
 	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
+
+	if (!req->work.creds) {
+		if (!req->creds)
+			req->work.creds = get_current_cred();
+		else {
+			req->work.creds = req->creds;
+			req->creds = NULL;
+		}
+	}
 	if (!req->work.fs && def->needs_fs) {
 		spin_lock(&current->fs->lock);
 		if (!current->fs->in_exec) {
@@ -1053,6 +1070,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 
 static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
+	if (!(req->flags & REQ_F_WORK_INITIALIZED))
+		return;
+
 	if (req->work.mm) {
 		mmdrop(req->work.mm);
 		req->work.mm = NULL;
@@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fsync always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fsync_finish;
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_fsync_finish);
+		else
+			req->work.func = io_fsync_finish;
 		return -EAGAIN;
 	}
 	__io_fsync(req);
@@ -2971,7 +2994,10 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fallocate always requiring blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fallocate_finish;
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_fallocate_finish);
+		else
+			req->work.func = io_fallocate_finish;
 		return -EAGAIN;
 	}
 
@@ -3500,7 +3526,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 		/* submission ref will be dropped, take it for async */
 		refcount_inc(&req->refs);
 
-		req->work.func = io_close_finish;
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_close_finish);
+		else
+			req->work.func = io_close_finish;
 		/*
 		 * Do manual async queue here to avoid grabbing files - we don't
 		 * need the files, and it'll cause io_close_finish() to close
@@ -3563,7 +3592,10 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
 	/* sync_file_range always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_sync_file_range_finish;
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_sync_file_range_finish);
+		else
+			req->work.func = io_sync_file_range_finish;
 		return -EAGAIN;
 	}
 
@@ -4032,7 +4064,10 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
 
 	ret = __io_accept(req, force_nonblock);
 	if (ret == -EAGAIN && force_nonblock) {
-		req->work.func = io_accept_finish;
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_accept_finish);
+		else
+			req->work.func = io_accept_finish;
 		return -EAGAIN;
 	}
 	return 0;
@@ -5032,6 +5067,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (!sqe)
 		return 0;
 
+	if (!(req->flags & REQ_F_WORK_INITIALIZED))
+		init_io_work(req, io_wq_submit_work);
+
 	if (io_op_defs[req->opcode].file_table) {
 		ret = io_grab_files(req);
 		if (unlikely(ret))
@@ -5667,19 +5705,24 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt;
-	const struct cred *old_creds = NULL;
+	const struct cred *creds, *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
-	if (req->work.creds && req->work.creds != current_cred()) {
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		creds = req->work.creds;
+	else
+		creds = req->creds;
+
+	if (creds && creds != current_cred()) {
 		if (old_creds)
 			revert_creds(old_creds);
-		if (old_creds == req->work.creds)
+		if (old_creds == creds)
 			old_creds = NULL; /* restored original creds */
 		else
-			old_creds = override_creds(req->work.creds);
+			old_creds = override_creds(creds);
 	}
 
 	ret = io_issue_sqe(req, sqe, true);
@@ -5696,6 +5739,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			goto exit;
 		}
 punt:
+		if (!(req->flags & REQ_F_WORK_INITIALIZED))
+			init_io_work(req, io_wq_submit_work);
+
 		if (io_op_defs[req->opcode].file_table) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -5948,7 +5994,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
-	INIT_IO_WORK(&req->work, io_wq_submit_work);
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
@@ -5970,11 +6015,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds))
+		req->creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->creds))
 			return -EINVAL;
-		get_cred(req->work.creds);
-	}
+		get_cred(req->creds);
+	} else
+		req->creds = NULL;
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags |= sqe_flags;
-- 
2.17.2



* [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-05-28  9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
@ 2020-05-28  9:15 ` Xiaoguang Wang
  2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
  2020-05-29  8:58 ` Pavel Begunkov
  2 siblings, 0 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-28  9:15 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

Basically, the IORING_OP_POLL_ADD command and async armed poll handlers
for regular commands don't touch io_wq_work, so we only need to do the
io_wq_work copy and restore when REQ_F_WORK_INITIALIZED is set.

Signed-off-by: Xiaoguang Wang <[email protected]>

---
V3:
  drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
  use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
  and restore.
---
 fs/io_uring.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7ba8590a45a6..cade241568c6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4411,7 +4411,8 @@ static void io_async_task_func(struct callback_head *cb)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	/* restore ->work in case we need to retry again */
-	memcpy(&req->work, &apoll->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
 	kfree(apoll);
 
 	if (!canceled) {
@@ -4508,7 +4509,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		return false;
 
 	req->flags |= REQ_F_POLLED;
-	memcpy(&apoll->work, &req->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&apoll->work, &req->work, sizeof(req->work));
 	had_io = req->io != NULL;
 
 	get_task_struct(current);
@@ -4533,7 +4535,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		if (!had_io)
 			io_poll_remove_double(req);
 		spin_unlock_irq(&ctx->completion_lock);
-		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		if (req->flags & REQ_F_WORK_INITIALIZED)
+			memcpy(&req->work, &apoll->work, sizeof(req->work));
 		kfree(apoll);
 		return false;
 	}
@@ -4578,7 +4581,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 			 * io_req_work_drop_env below when dropping the
 			 * final reference.
 			 */
-			memcpy(&req->work, &apoll->work, sizeof(req->work));
+			if (req->flags & REQ_F_WORK_INITIALIZED)
+				memcpy(&req->work, &apoll->work,
+				       sizeof(req->work));
 			kfree(apoll);
 		}
 	}
-- 
2.17.2



* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-28  9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
  2020-05-28  9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
@ 2020-05-28 17:03 ` Jens Axboe
  2020-05-29  3:58   ` Xiaoguang Wang
  2020-05-29  8:58 ` Pavel Begunkov
  2 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2020-05-28 17:03 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence, joseph.qi

On 5/28/20 3:15 AM, Xiaoguang Wang wrote:
> If requests can be submitted and completed inline, we don't need to
> initialize the whole io_wq_work in io_init_req(), which is an expensive
> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
> io_wq_work has been initialized.
> 
> I use /dev/nullb0 to evaluate performance improvement in my physical
> machine:
>   modprobe null_blk nr_devices=1 completion_nsec=0
>   sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>   -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>   -time_based -runtime=120
> 
> before this patch:
> Run status group 0 (all jobs):
>    READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>    io=84.8GiB (91.1GB), run=120001-120001msec
> 
> With this patch:
> Run status group 0 (all jobs):
>    READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>    io=89.2GiB (95.8GB), run=120001-120001msec
> 
> About 5% improvement.

I think this is a big enough win to warrant looking closer
at this. Just a quick comment from me so far:

> @@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
>  {
>  	/* fsync always requires a blocking context */
>  	if (force_nonblock) {
> -		req->work.func = io_fsync_finish;
> +		if (!(req->flags & REQ_F_WORK_INITIALIZED))
> +			init_io_work(req, io_fsync_finish);
> +		else
> +			req->work.func = io_fsync_finish;

This pattern is repeated enough to warrant a helper, ala:

static void io_req_init_async(req, func)
{
	if (req->flags & REQ_F_WORK_INITIALIZED)
		req->work.func = func;
	else
		init_io_work(req, func);
}

also swapped the conditions, I tend to find it easier to read without
the negation.
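
At the call sites, that would collapse each of those hunks into a single
call. For example, in io_fsync() it would look something like this
(untested, just to illustrate the idea with the helper named above):

	/* fsync always requires a blocking context */
	if (force_nonblock) {
		io_req_init_async(req, io_fsync_finish);
		return -EAGAIN;
	}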

-- 
Jens Axboe



* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
@ 2020-05-29  3:58   ` Xiaoguang Wang
  0 siblings, 0 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-29  3:58 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence, joseph.qi

hi,

> On 5/28/20 3:15 AM, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize the whole io_wq_work in io_init_req(), which is an expensive
>> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
>> io_wq_work has been initialized.
>>
>> I use /dev/nullb0 to evaluate performance improvement in my physical
>> machine:
>>    modprobe null_blk nr_devices=1 completion_nsec=0
>>    sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>    -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>    -time_based -runtime=120
>>
>> before this patch:
>> Run status group 0 (all jobs):
>>     READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>     io=84.8GiB (91.1GB), run=120001-120001msec
>>
>> With this patch:
>> Run status group 0 (all jobs):
>>     READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>     io=89.2GiB (95.8GB), run=120001-120001msec
>>
>> About 5% improvement.
> 
> I think this is a big enough win to warrant looking closer
> at this. Just a quick comment from me so far:
Yeah, to be honest, I didn't expect such an obvious improvement. But I have
run multiple rounds of the same test and always get a similar improvement;
if you have some free time, you can give it a try :)

> 
>> @@ -2923,7 +2943,10 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
>>   {
>>   	/* fsync always requires a blocking context */
>>   	if (force_nonblock) {
>> -		req->work.func = io_fsync_finish;
>> +		if (!(req->flags & REQ_F_WORK_INITIALIZED))
>> +			init_io_work(req, io_fsync_finish);
>> +		else
>> +			req->work.func = io_fsync_finish;
> 
> This pattern is repeated enough to warrant a helper, ala:
> 
> static void io_req_init_async(req, func)
> {
> 	if (req->flags & REQ_F_WORK_INITIALIZED)
> 		req->work.func = func;
> 	else
> 		init_io_work(req, func);
> }
> 
> also swapped the conditions, I tend to find it easier to read without
> the negation.
Thanks for your suggestions. I'll prepare a V4 soon.

Regards,
Xiaoguang Wang

> 


* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-28  9:15 [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
  2020-05-28  9:15 ` [PATCH v3 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
  2020-05-28 17:03 ` [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
@ 2020-05-29  8:58 ` Pavel Begunkov
  2020-05-29 14:27   ` Jens Axboe
  2020-05-30 13:36   ` Xiaoguang Wang
  2 siblings, 2 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-05-29  8:58 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi

On 28/05/2020 12:15, Xiaoguang Wang wrote:
> If requests can be submitted and completed inline, we don't need to
> initialize the whole io_wq_work in io_init_req(), which is an expensive
> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
> io_wq_work has been initialized.

It looks nicer, especially if you add a helper as Jens suggested.

The other thing: even though I hate treating some of the fields differently
from the others, I don't like the ->creds tossing either.

Did you consider using only ->work.creds, without adding req->creds, like
in the untested incremental below? init_io_work() there is misleading and
should be reworked somehow.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4dd3295d74f6..4086561ce444 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -643,7 +643,6 @@ struct io_kiocb {
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
-	const struct cred	*creds;
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
@@ -894,8 +893,16 @@ static const struct file_operations io_uring_fops;
 static inline void init_io_work(struct io_kiocb *req,
 			void (*func)(struct io_wq_work **))
 {
-	req->work = (struct io_wq_work){ .func = func };
-	req->flags |= REQ_F_WORK_INITIALIZED;
+	struct io_wq_work *work = &req->work;
+
+	/* work->creds are already initialised by a user */
+	work->list.next = NULL;
+	work->func = func;
+	work->files = NULL;
+	work->mm = NULL;
+	work->fs = NULL;
+	work->flags = REQ_F_WORK_INITIALIZED;
+	work->task_pid = 0;
 }
 struct sock *io_uring_get_socket(struct file *file)
 {
@@ -1019,15 +1026,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
 	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();

-	if (!req->work.creds) {
-		if (!req->creds)
-			req->work.creds = get_current_cred();
-		else {
-			req->work.creds = req->creds;
-			req->creds = NULL;
-		}
-	}
 	if (!req->work.fs && def->needs_fs) {
 		spin_lock(&current->fs->lock);
 		if (!current->fs->in_exec) {
@@ -1044,6 +1045,12 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,

 static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
+	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
+	if (req->work.creds) {
+		put_cred(req->work.creds);
+		req->work.creds = NULL;
+	}
+
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;

@@ -1051,10 +1058,6 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
 		mmdrop(req->work.mm);
 		req->work.mm = NULL;
 	}
-	if (req->work.creds) {
-		put_cred(req->work.creds);
-		req->work.creds = NULL;
-	}
 	if (req->work.fs) {
 		struct fs_struct *fs = req->work.fs;

@@ -5901,12 +5904,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct
io_kiocb *req,

 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		req->creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->creds))
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
 			return -EINVAL;
-		get_cred(req->creds);
+		get_cred(req->work.creds);
 	} else
-		req->creds = NULL;
+		req->work.creds = NULL;

 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags |= sqe_flags;

-- 
Pavel Begunkov


* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-29  8:58 ` Pavel Begunkov
@ 2020-05-29 14:27   ` Jens Axboe
  2020-05-30 13:36   ` Xiaoguang Wang
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2020-05-29 14:27 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 5/29/20 2:58 AM, Pavel Begunkov wrote:
> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize the whole io_wq_work in io_init_req(), which is an expensive
>> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
>> io_wq_work has been initialized.
> 
> It looks nicer, especially if you add a helper as Jens suggested.
> 
> The other thing: even though I hate treating some of the fields differently
> from the others, I don't like the ->creds tossing either.
> 
> Did you consider using only ->work.creds, without adding req->creds, like
> in the untested incremental below? init_io_work() there is misleading and
> should be reworked somehow.

I had that thought too when reading the patchset, would be nice _not_ to have
to add a new creds field.

-- 
Jens Axboe



* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-29  8:58 ` Pavel Begunkov
  2020-05-29 14:27   ` Jens Axboe
@ 2020-05-30 13:36   ` Xiaoguang Wang
  2020-05-30 13:43     ` Jens Axboe
  2020-05-30 19:38     ` Pavel Begunkov
  1 sibling, 2 replies; 9+ messages in thread
From: Xiaoguang Wang @ 2020-05-30 13:36 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi

hi,

> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize the whole io_wq_work in io_init_req(), which is an expensive
>> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
>> io_wq_work has been initialized.
> 
> It looks nicer, especially if you add a helper as Jens suggested.
Sure, I'll add a helper in V4, thanks.

> 
> The other thing: even though I hate treating some of the fields differently
> from the others, I don't like the ->creds tossing either.
> 
> Did you consider using only ->work.creds, without adding req->creds, like
> in the untested incremental below? init_io_work() there is misleading and
> should be reworked somehow.
But if we don't add a new req->creds, I think there will be some potential risks.
In the current io_uring mainline code, look at io_kiocb's memory layout:
crash> struct -o io_kiocb
struct io_kiocb {
         union {
     [0]     struct file *file;
     [0]     struct io_rw rw;
     [0]     struct io_poll_iocb poll;
     [0]     struct io_accept accept;
     [0]     struct io_sync sync;
     [0]     struct io_cancel cancel;
     [0]     struct io_timeout timeout;
     [0]     struct io_connect connect;
     [0]     struct io_sr_msg sr_msg;
     [0]     struct io_open open;
     [0]     struct io_close close;
     [0]     struct io_files_update files_update;
     [0]     struct io_fadvise fadvise;
     [0]     struct io_madvise madvise;
     [0]     struct io_epoll epoll;
     [0]     struct io_splice splice;
     [0]     struct io_provide_buf pbuf;
         };
    [64] struct io_async_ctx *io;
    [72] int cflags;
    [76] u8 opcode;
    [78] u16 buf_index;
    [80] struct io_ring_ctx *ctx;
    [88] struct list_head list;
   [104] unsigned int flags;
   [108] refcount_t refs;
   [112] struct task_struct *task;
   [120] unsigned long fsize;
   [128] u64 user_data;
   [136] u32 result;
   [140] u32 sequence;
   [144] struct list_head link_list;
   [160] struct list_head inflight_entry;
   [176] struct percpu_ref *fixed_file_refs;
         union {
             struct {
   [184]         struct callback_head task_work;
   [200]         struct hlist_node hash_node;
   [216]         struct async_poll *apoll;
             };
   [184]     struct io_wq_work work;
         };
}
SIZE: 240

struct io_wq_work {
    [0] struct io_wq_work_node list;
    [8] void (*func)(struct io_wq_work **);
   [16] struct files_struct *files;
   [24] struct mm_struct *mm;
   [32] const struct cred *creds;
   [40] struct fs_struct *fs;
   [48] unsigned int flags;
   [52] pid_t task_pid;
}
SIZE: 56

The risk mainly comes from the union:
union {
	/*
	 * Only commands that never go async can use the below fields,
	 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
	 * async armed poll handlers for regular commands. The latter
	 * restore the work, if needed.
	 */
	struct {
		struct callback_head	task_work;
		struct hlist_node	hash_node;
		struct async_poll	*apoll;
	};
	struct io_wq_work	work;
};

1. apoll and creds are at the same memory offset; in the 'async armed poll handlers' case,
apoll will be used, which means creds will be overwritten. In the patch "io_uring: avoid
unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
to control whether to restore io_wq_work, so your code below will break:

static inline void io_req_work_drop_env(struct io_kiocb *req)
{
	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
	if (req->work.creds) {
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here req->work.creds will be invalid, or I'd still need to use some space
to record the original req->work.creds and restore it later.

		put_cred(req->work.creds);
		req->work.creds = NULL;
	}
	if (!(req->flags & REQ_F_WORK_INITIALIZED))
  		return;

2. For the IORING_OP_POLL_ADD case, the current mainline code uses task_work and hash_node,
32 bytes in total, which means io_wq_work's members list, func, files and mm would be
overwritten, but creds is not touched, so it's safe now. But if we add a new member to
struct {
	struct callback_head	task_work;
	struct hlist_node	hash_node;
	struct async_poll	*apoll;
};
say callback_head gains a new member, our check will still break.

3. IMO, io_wq_work just describes the running environment needed by reqs that will be
punted to io-wq; from a software design point of view, reqs submitted and completed
inline should not touch this struct at all. Also, the current io_kiocb is 240 bytes, and
a new pointer makes it 248 bytes, which still fits in 4 cache lines with a 64-byte cache line.
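
Just to make the overlap in point 1 concrete with the offsets above,
something like the following would hold (illustrative only, not something
I'm proposing to add to the patch):

	/* apoll (offset 216) aliases work.creds (offset 184 + 32) */
	BUILD_BUG_ON(offsetof(struct io_kiocb, apoll) !=
		     offsetof(struct io_kiocb, work) +
		     offsetof(struct io_wq_work, creds));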


Regards,
Xiaoguang Wang

> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 4dd3295d74f6..4086561ce444 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -643,7 +643,6 @@ struct io_kiocb {
>   	unsigned int		flags;
>   	refcount_t		refs;
>   	struct task_struct	*task;
> -	const struct cred	*creds;
>   	unsigned long		fsize;
>   	u64			user_data;
>   	u32			result;
> @@ -894,8 +893,16 @@ static const struct file_operations io_uring_fops;
>   static inline void init_io_work(struct io_kiocb *req,
>   			void (*func)(struct io_wq_work **))
>   {
> -	req->work = (struct io_wq_work){ .func = func };
> -	req->flags |= REQ_F_WORK_INITIALIZED;
> +	struct io_wq_work *work = &req->work;
> +
> +	/* work->creds are already initialised by a user */
> +	work->list.next = NULL;
> +	work->func = func;
> +	work->files = NULL;
> +	work->mm = NULL;
> +	work->fs = NULL;
> +	work->flags = REQ_F_WORK_INITIALIZED;
> +	work->task_pid = 0;
>   }
>   struct sock *io_uring_get_socket(struct file *file)
>   {
> @@ -1019,15 +1026,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
>   		mmgrab(current->mm);
>   		req->work.mm = current->mm;
>   	}
> +	if (!req->work.creds)
> +		req->work.creds = get_current_cred();
> 
> -	if (!req->work.creds) {
> -		if (!req->creds)
> -			req->work.creds = get_current_cred();
> -		else {
> -			req->work.creds = req->creds;
> -			req->creds = NULL;
> -		}
> -	}
>   	if (!req->work.fs && def->needs_fs) {
>   		spin_lock(&current->fs->lock);
>   		if (!current->fs->in_exec) {
> @@ -1044,6 +1045,12 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
> 
>   static inline void io_req_work_drop_env(struct io_kiocb *req)
>   {
> +	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
> +	if (req->work.creds) {
> +		put_cred(req->work.creds);
> +		req->work.creds = NULL;
> +	}
> +
>   	if (!(req->flags & REQ_F_WORK_INITIALIZED))
>   		return;
> 
> @@ -1051,10 +1058,6 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
>   		mmdrop(req->work.mm);
>   		req->work.mm = NULL;
>   	}
> -	if (req->work.creds) {
> -		put_cred(req->work.creds);
> -		req->work.creds = NULL;
> -	}
>   	if (req->work.fs) {
>   		struct fs_struct *fs = req->work.fs;
> 
> @@ -5901,12 +5904,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct
> io_kiocb *req,
> 
>   	id = READ_ONCE(sqe->personality);
>   	if (id) {
> -		req->creds = idr_find(&ctx->personality_idr, id);
> -		if (unlikely(!req->creds))
> +		req->work.creds = idr_find(&ctx->personality_idr, id);
> +		if (unlikely(!req->work.creds))
>   			return -EINVAL;
> -		get_cred(req->creds);
> +		get_cred(req->work.creds);
>   	} else
> -		req->creds = NULL;
> +		req->work.creds = NULL;
> 
>   	/* same numerical values with corresponding REQ_F_*, safe to copy */
>   	req->flags |= sqe_flags;
> 


* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 13:36   ` Xiaoguang Wang
@ 2020-05-30 13:43     ` Jens Axboe
  2020-05-30 19:38     ` Pavel Begunkov
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2020-05-30 13:43 UTC (permalink / raw)
  To: Xiaoguang Wang, Pavel Begunkov, io-uring; +Cc: joseph.qi

On 5/30/20 7:36 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 28/05/2020 12:15, Xiaoguang Wang wrote:
>>> If requests can be submitted and completed inline, we don't need to
>>> initialize the whole io_wq_work in io_init_req(), which is an expensive
>>> operation. Add a new flag, 'REQ_F_WORK_INITIALIZED', to mark whether
>>> io_wq_work has been initialized.
>>
>> It looks nicer, especially if you add a helper as Jens suggested.
> Sure, I'll add a helper in V4, thanks.
> 
>>
>> The other thing: even though I hate treating some of the fields differently
>> from the others, I don't like the ->creds tossing either.
>>
>> Did you consider using only ->work.creds, without adding req->creds, like
>> in the untested incremental below? init_io_work() there is misleading and
>> should be reworked somehow.
> But if not adding a new req->creds, I think there will be some potential risks.
> In current io_uring mainline codes, look at io_kiocb's memory layout
> crash> struct -o io_kiocb
> struct io_kiocb {
>          union {
>      [0]     struct file *file;
>      [0]     struct io_rw rw;
>      [0]     struct io_poll_iocb poll;
>      [0]     struct io_accept accept;
>      [0]     struct io_sync sync;
>      [0]     struct io_cancel cancel;
>      [0]     struct io_timeout timeout;
>      [0]     struct io_connect connect;
>      [0]     struct io_sr_msg sr_msg;
>      [0]     struct io_open open;
>      [0]     struct io_close close;
>      [0]     struct io_files_update files_update;
>      [0]     struct io_fadvise fadvise;
>      [0]     struct io_madvise madvise;
>      [0]     struct io_epoll epoll;
>      [0]     struct io_splice splice;
>      [0]     struct io_provide_buf pbuf;
>          };
>     [64] struct io_async_ctx *io;
>     [72] int cflags;
>     [76] u8 opcode;
>     [78] u16 buf_index;
>     [80] struct io_ring_ctx *ctx;
>     [88] struct list_head list;
>    [104] unsigned int flags;
>    [108] refcount_t refs;
>    [112] struct task_struct *task;
>    [120] unsigned long fsize;
>    [128] u64 user_data;
>    [136] u32 result;
>    [140] u32 sequence;
>    [144] struct list_head link_list;
>    [160] struct list_head inflight_entry;
>    [176] struct percpu_ref *fixed_file_refs;
>          union {
>              struct {
>    [184]         struct callback_head task_work;
>    [200]         struct hlist_node hash_node;
>    [216]         struct async_poll *apoll;
>              };
>    [184]     struct io_wq_work work;
>          };
> }
> SIZE: 240
> 
> struct io_wq_work {
>     [0] struct io_wq_work_node list;
>     [8] void (*func)(struct io_wq_work **);
>    [16] struct files_struct *files;
>    [24] struct mm_struct *mm;
>    [32] const struct cred *creds;
>    [40] struct fs_struct *fs;
>    [48] unsigned int flags;
>    [52] pid_t task_pid;
> }
> SIZE: 56
> 
> The risk mainly comes from the union:
> union {
> 	/*
> 	 * Only commands that never go async can use the below fields,
> 	 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
> 	 * async armed poll handlers for regular commands. The latter
> 	 * restore the work, if needed.
> 	 */
> 	struct {
> 		struct callback_head	task_work;
> 		struct hlist_node	hash_node;
> 		struct async_poll	*apoll;
> 	};
> 	struct io_wq_work	work;
> };
> 
> 1, apoll and creds are in same memory offset, for 'async armed poll handlers' case,
> apoll will be used, that means creds will be overwrited. In patch "io_uring: avoid
> unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
> to control whether to do io_wq_work restore, then your below codes will break:
> 
> static inline void io_req_work_drop_env(struct io_kiocb *req)
> {
> 	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
> 	if (req->work.creds) {
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Here req->work.creds will be invalid, or I still need to use some space
> to record original req->work.creds, and do creds restore.
> 
> 		put_cred(req->work.creds);
> 		req->work.creds = NULL;
> 	}
> 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
>   		return;
> 
> 2, For IORING_OP_POLL_ADD case, current mainline codes will use task_work and hash_node,
> 32 bytes, that means io_wq_work's member list, func, files and mm would be overwrited,
> but will not touch creds, it's safe now. But if we will add some new member to
> struct {
> 	struct callback_head	task_work;
> 	struct hlist_node	hash_node;
> 	struct async_poll	*apoll;
> };
> say callback_head adds a new member, our check will still break.
> 
> 3. IMO, io_wq_work is just to describe needed running environment for reqs that will be
> punted to io-wq, for reqs submitted and completed inline should not touch this struct
> from software design view, and current io_kiocb is 240 bytes, and a new pointer will be
> 248 bytes, still 4 cache lines for cache line 64 bytes.

I'd say let's do the extra creds member for now. It's safer. We can always look into
shrinking down the size and re-using the io_wq_work part. I'd rather get this
prepped up and ready for 5.8.

Are you sending out a v4 today?

-- 
Jens Axboe



* Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 13:36   ` Xiaoguang Wang
  2020-05-30 13:43     ` Jens Axboe
@ 2020-05-30 19:38     ` Pavel Begunkov
  1 sibling, 0 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-05-30 19:38 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi

On 30/05/2020 16:36, Xiaoguang Wang wrote:
> 
> The risk mainly comes from the union:
> union {
>     /*
>      * Only commands that never go async can use the below fields,
>      * obviously. Right now only IORING_OP_POLL_ADD uses them, and
>      * async armed poll handlers for regular commands. The latter
>      * restore the work, if needed.
>      */
>     struct {
>         struct callback_head    task_work;
>         struct hlist_node    hash_node;
>         struct async_poll    *apoll;
>     };
>     struct io_wq_work    work;
> };
> 
> 1, apoll and creds are in same memory offset, for 'async armed poll handlers' case,
> apoll will be used, that means creds will be overwrited. In patch "io_uring: avoid
> unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
> to control whether to do io_wq_work restore, then your below codes will break:

Yes, that's an example, which doesn't even consider the second patch. But great
that you anticipated the error. Unconditional partial copy/init probably would
solve the issue, but let's keep it aside for now.

> 
> static inline void io_req_work_drop_env(struct io_kiocb *req)
> {
>     /* always init'ed, put before REQ_F_WORK_INITIALIZED check */
>     if (req->work.creds) {
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Here req->work.creds will be invalid, or I still need to use some space
> to record original req->work.creds, and do creds restore.
> 
>         put_cred(req->work.creds);
>         req->work.creds = NULL;
>     }
>     if (!(req->flags & REQ_F_WORK_INITIALIZED))
>          return;
> 
> 2, For IORING_OP_POLL_ADD case, current mainline codes will use task_work and
> hash_node,
> 32 bytes, that means io_wq_work's member list, func, files and mm would be
> overwrited,
> but will not touch creds, it's safe now. But if we will add some new member to
> struct {
>     struct callback_head    task_work;
>     struct hlist_node    hash_node;
>     struct async_poll    *apoll;
> };
> say callback_head adds a new member, our check will still break.
> 
> 3. IMO, io_wq_work is just to describe needed running environment for reqs that
> will be
> punted to io-wq, for reqs submitted and completed inline should not touch this
> struct
> from software design view, and current io_kiocb is 240 bytes, and a new pointer

Instead, it stores an entity in 2 different places, adding yet another
thing/state to keep in mind. Arguable. I'd rather say -- neither one is better.
And that's why I like the simplicity of initialising it always.


> will be
> 248 bytes, still 4 cache lines for cache line 64 bytes.
> 
> 

-- 
Pavel Begunkov

