* [PATCH 01/11] io_uring: allow unbreakable links
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 21:10 ` Pavel Begunkov
2019-12-10 15:57 ` [PATCH 02/11] io-wq: remove worker->wait waitqueue Jens Axboe
` (9 subsequent siblings)
10 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, stable, Pavel Begunkov, 李通洲
Some commands will invariably end in a failure in the sense that the
completion result will be less than zero. One such example is timeouts
that don't have a completion count set; they will always complete with
-ETIME unless cancelled.
For linked commands, we sever links and fail the rest of the chain if
the result is less than zero. Since we have commands where we know that
will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
regardless of the completion result. Note that the link will still sever
if we fail to submit the parent request; hard links are only resilient
in the presence of completion results for requests that did submit
correctly.
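For reference (not part of the patch itself), here is a hypothetical
userspace sketch of the new flag with liburing, assuming headers new
enough to define IOSQE_IO_HARDLINK: a pure timeout, which always
completes with -ETIME, hard-linked to a readv. With plain IOSQE_IO_LINK
the -ETIME completion would sever the chain and cancel the read; with
IOSQE_IO_HARDLINK the read is still issued. The helper name, fd, and
iovec are illustrative only.

#include <errno.h>
#include <sys/uio.h>
#include <liburing.h>

/* hypothetical helper: a 1 second pure timeout hard-linked to a readv */
static int queue_timeout_then_read(struct io_uring *ring, int fd,
				   struct iovec *iov, unsigned nr_vecs)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* no completion count set: this timeout will finish with -ETIME */
	io_uring_prep_timeout(sqe, &ts, 0, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_HARDLINK);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* still issued after the -ETIME completion because of the hard link */
	io_uring_prep_readv(sqe, fd, iov, nr_vecs, 0);

	return io_uring_submit(ring);
}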
Cc: [email protected] # v5.4
Cc: Pavel Begunkov <[email protected]>
Reported-by: 李通洲 <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 96 ++++++++++++++++++++---------------
include/uapi/linux/io_uring.h | 1 +
2 files changed, 56 insertions(+), 41 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 405be10da73d..4cda61fe67da 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -377,6 +377,7 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
u64 user_data;
u32 result;
u32 sequence;
@@ -1292,6 +1293,12 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
+
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
@@ -1299,8 +1306,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if ((req->flags & REQ_F_LINK) && res != req->result)
- req->flags |= REQ_F_FAIL_LINK;
+ if (res != req->result)
+ req_set_fail_links(req);
io_cqring_add_event(req, res);
}
@@ -1330,8 +1337,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if ((req->flags & REQ_F_LINK) && res != req->result)
- req->flags |= REQ_F_FAIL_LINK;
+ if (res != req->result)
+ req_set_fail_links(req);
req->result = res;
if (res != -EAGAIN)
req->flags |= REQ_F_IOPOLL_COMPLETED;
@@ -1956,8 +1963,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
end > 0 ? end : LLONG_MAX,
fsync_flags & IORING_FSYNC_DATASYNC);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2003,8 +2010,8 @@ static int io_sync_file_range(struct io_kiocb *req,
ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2079,8 +2086,8 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
out:
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
@@ -2161,8 +2168,8 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
out:
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
@@ -2196,8 +2203,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2263,8 +2270,8 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret == -ERESTARTSYS)
ret = -EINTR;
out:
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2340,8 +2347,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_unlock_irq(&ctx->completion_lock);
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req(req);
return 0;
}
@@ -2399,8 +2406,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
io_cqring_ev_posted(ctx);
- if (ret < 0 && req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, &nxt);
if (nxt)
*workptr = &nxt->work;
@@ -2582,8 +2589,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_put_req(req);
return HRTIMER_NORESTART;
}
@@ -2608,8 +2614,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -1)
return -EALREADY;
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
io_put_req(req);
return 0;
@@ -2640,8 +2645,8 @@ static int io_timeout_remove(struct io_kiocb *req,
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- if (ret < 0 && req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req(req);
return 0;
}
@@ -2655,13 +2660,19 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1) {
+ printk("1\n");
return -EINVAL;
- if (sqe->off && is_timeout_link)
+ }
+ if (sqe->off && is_timeout_link) {
+ printk("2\n");
return -EINVAL;
+ }
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~IORING_TIMEOUT_ABS)
+ if (flags & ~IORING_TIMEOUT_ABS) {
+ printk("3\n");
return -EINVAL;
+ }
data = &io->timeout;
data->req = req;
@@ -2822,8 +2833,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
}
@@ -3044,8 +3055,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_put_req(req);
if (ret) {
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
}
@@ -3179,8 +3189,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
- if (prev->flags & REQ_F_LINK)
- prev->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(prev);
io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
-ETIME);
io_put_req(prev);
@@ -3273,8 +3282,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
/* and drop final reference, if we failed */
if (ret) {
io_cqring_add_event(req, ret);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_put_req(req);
}
}
@@ -3293,8 +3301,7 @@ static void io_queue_sqe(struct io_kiocb *req)
if (ret) {
if (ret != -EIOCBQUEUED) {
io_cqring_add_event(req, ret);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_double_put_req(req);
}
} else
@@ -3311,7 +3318,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK)
static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
struct io_kiocb **link)
@@ -3349,6 +3357,9 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
if (req->sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+ if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ req->flags |= REQ_F_HARDLINK;
+
io = kmalloc(sizeof(*io), GFP_KERNEL);
if (!io) {
ret = -EAGAIN;
@@ -3358,13 +3369,16 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
ret = io_req_defer_prep(req, io);
if (ret) {
kfree(io);
+ /* fail even hard links since we don't submit */
prev->flags |= REQ_F_FAIL_LINK;
goto err_req;
}
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->link_list, &prev->link_list);
- } else if (req->sqe->flags & IOSQE_IO_LINK) {
+ } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
+ if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ req->flags |= REQ_F_HARDLINK;
INIT_LIST_HEAD(&req->link_list);
*link = req;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index eabccb46edd1..ea231366f5fd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -48,6 +48,7 @@ struct io_uring_sqe {
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
+#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
/*
* io_uring_setup() flags
--
2.24.0
* Re: [PATCH 01/11] io_uring: allow unbreakable links
2019-12-10 15:57 ` [PATCH 01/11] io_uring: allow unbreakable links Jens Axboe
@ 2019-12-10 21:10 ` Pavel Begunkov
2019-12-10 21:12 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2019-12-10 21:10 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: stable, 李通洲
Apart from the debug code (see comments below), the io_uring part of
the patchset looks good.
Reviewed-by: Pavel Begunkov <[email protected]>
On 10/12/2019 18:57, Jens Axboe wrote:
> Some commands will invariably end in a failure in the sense that the
> completion result will be less than zero. One such example is timeouts
> that don't have a completion count set; they will always complete with
> -ETIME unless cancelled.
>
> For linked commands, we sever links and fail the rest of the chain if
> the result is less than zero. Since we have commands where we know that
> will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
> regardless of the completion result. Note that the link will still sever
> if we fail to submit the parent request; hard links are only resilient
> in the presence of completion results for requests that did submit
> correctly.
>
> Cc: [email protected] # v5.4
> Cc: Pavel Begunkov <[email protected]>
> Reported-by: 李通洲 <[email protected]>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> fs/io_uring.c | 96 ++++++++++++++++++++---------------
> include/uapi/linux/io_uring.h | 1 +
> 2 files changed, 56 insertions(+), 41 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 405be10da73d..4cda61fe67da 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -377,6 +377,7 @@ struct io_kiocb {
> #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
> #define REQ_F_INFLIGHT 16384 /* on inflight list */
> #define REQ_F_COMP_LOCKED 32768 /* completion under lock */
> +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
> u64 user_data;
> u32 result;
> u32 sequence;
> @@ -1292,6 +1293,12 @@ static void kiocb_end_write(struct io_kiocb *req)
> file_end_write(req->file);
> }
>
> +static inline void req_set_fail_links(struct io_kiocb *req)
> +{
> + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
> + req->flags |= REQ_F_FAIL_LINK;
> +}
> +
> static void io_complete_rw_common(struct kiocb *kiocb, long res)
> {
> struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
> @@ -1299,8 +1306,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
> if (kiocb->ki_flags & IOCB_WRITE)
> kiocb_end_write(req);
>
> - if ((req->flags & REQ_F_LINK) && res != req->result)
> - req->flags |= REQ_F_FAIL_LINK;
> + if (res != req->result)
> + req_set_fail_links(req);
> io_cqring_add_event(req, res);
> }
>
> @@ -1330,8 +1337,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
> if (kiocb->ki_flags & IOCB_WRITE)
> kiocb_end_write(req);
>
> - if ((req->flags & REQ_F_LINK) && res != req->result)
> - req->flags |= REQ_F_FAIL_LINK;
> + if (res != req->result)
> + req_set_fail_links(req);
> req->result = res;
> if (res != -EAGAIN)
> req->flags |= REQ_F_IOPOLL_COMPLETED;
> @@ -1956,8 +1963,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> end > 0 ? end : LLONG_MAX,
> fsync_flags & IORING_FSYNC_DATASYNC);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2003,8 +2010,8 @@ static int io_sync_file_range(struct io_kiocb *req,
>
> ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2079,8 +2086,8 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>
> out:
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req_find_next(req, nxt);
> return 0;
> #else
> @@ -2161,8 +2168,8 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>
> out:
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req_find_next(req, nxt);
> return 0;
> #else
> @@ -2196,8 +2203,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> }
> if (ret == -ERESTARTSYS)
> ret = -EINTR;
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2263,8 +2270,8 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> if (ret == -ERESTARTSYS)
> ret = -EINTR;
> out:
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2340,8 +2347,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> spin_unlock_irq(&ctx->completion_lock);
>
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req(req);
> return 0;
> }
> @@ -2399,8 +2406,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
>
> io_cqring_ev_posted(ctx);
>
> - if (ret < 0 && req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req_find_next(req, &nxt);
> if (nxt)
> *workptr = &nxt->work;
> @@ -2582,8 +2589,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> io_cqring_ev_posted(ctx);
> - if (req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(req);
> io_put_req(req);
> return HRTIMER_NORESTART;
> }
> @@ -2608,8 +2614,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
> if (ret == -1)
> return -EALREADY;
>
> - if (req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(req);
> io_cqring_fill_event(req, -ECANCELED);
> io_put_req(req);
> return 0;
> @@ -2640,8 +2645,8 @@ static int io_timeout_remove(struct io_kiocb *req,
> io_commit_cqring(ctx);
> spin_unlock_irq(&ctx->completion_lock);
> io_cqring_ev_posted(ctx);
> - if (ret < 0 && req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req(req);
> return 0;
> }
> @@ -2655,13 +2660,19 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
>
> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> return -EINVAL;
> - if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
> + if (sqe->ioprio || sqe->buf_index || sqe->len != 1) {
> + printk("1\n");
debug output
> return -EINVAL;
> - if (sqe->off && is_timeout_link)
> + }
> + if (sqe->off && is_timeout_link) {
> + printk("2\n");
same
> return -EINVAL;
> + }
> flags = READ_ONCE(sqe->timeout_flags);
> - if (flags & ~IORING_TIMEOUT_ABS)
> + if (flags & ~IORING_TIMEOUT_ABS) {
> + printk("3\n");
same
> return -EINVAL;
> + }
>
> data = &io->timeout;
> data->req = req;
> @@ -2822,8 +2833,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
> io_cqring_ev_posted(ctx);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> - req->flags |= REQ_F_FAIL_LINK;
> + if (ret < 0)
> + req_set_fail_links(req);
> io_put_req_find_next(req, nxt);
> }
>
> @@ -3044,8 +3055,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
> io_put_req(req);
>
> if (ret) {
> - if (req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(req);
> io_cqring_add_event(req, ret);
> io_put_req(req);
> }
> @@ -3179,8 +3189,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> if (prev) {
> - if (prev->flags & REQ_F_LINK)
> - prev->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(prev);
> io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
> -ETIME);
> io_put_req(prev);
> @@ -3273,8 +3282,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
> /* and drop final reference, if we failed */
> if (ret) {
> io_cqring_add_event(req, ret);
> - if (req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(req);
> io_put_req(req);
> }
> }
> @@ -3293,8 +3301,7 @@ static void io_queue_sqe(struct io_kiocb *req)
> if (ret) {
> if (ret != -EIOCBQUEUED) {
> io_cqring_add_event(req, ret);
> - if (req->flags & REQ_F_LINK)
> - req->flags |= REQ_F_FAIL_LINK;
> + req_set_fail_links(req);
> io_double_put_req(req);
> }
> } else
> @@ -3311,7 +3318,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
> }
>
>
> -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
> +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
> + IOSQE_IO_HARDLINK)
>
> static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
> struct io_kiocb **link)
> @@ -3349,6 +3357,9 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
> if (req->sqe->flags & IOSQE_IO_DRAIN)
> (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
>
> + if (req->sqe->flags & IOSQE_IO_HARDLINK)
> + req->flags |= REQ_F_HARDLINK;
> +
> io = kmalloc(sizeof(*io), GFP_KERNEL);
> if (!io) {
> ret = -EAGAIN;
> @@ -3358,13 +3369,16 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
> ret = io_req_defer_prep(req, io);
> if (ret) {
> kfree(io);
> + /* fail even hard links since we don't submit */
> prev->flags |= REQ_F_FAIL_LINK;
> goto err_req;
> }
> trace_io_uring_link(ctx, req, prev);
> list_add_tail(&req->link_list, &prev->link_list);
> - } else if (req->sqe->flags & IOSQE_IO_LINK) {
> + } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
> req->flags |= REQ_F_LINK;
> + if (req->sqe->flags & IOSQE_IO_HARDLINK)
> + req->flags |= REQ_F_HARDLINK;
>
> INIT_LIST_HEAD(&req->link_list);
> *link = req;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index eabccb46edd1..ea231366f5fd 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -48,6 +48,7 @@ struct io_uring_sqe {
> #define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
> #define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
> #define IOSQE_IO_LINK (1U << 2) /* links next sqe */
> +#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
>
> /*
> * io_uring_setup() flags
>
--
Pavel Begunkov
* Re: [PATCH 01/11] io_uring: allow unbreakable links
2019-12-10 21:10 ` Pavel Begunkov
@ 2019-12-10 21:12 ` Jens Axboe
2019-12-10 21:28 ` Pavel Begunkov
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 21:12 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: stable, 李通洲
On 12/10/19 2:10 PM, Pavel Begunkov wrote:
> Apart from the debug code (see comments below), the io_uring part of
> the patchset looks good.
Hah, oops!
> Reviewed-by: Pavel Begunkov <[email protected]>
Thanks
--
Jens Axboe
* Re: [PATCH 01/11] io_uring: allow unbreakable links
2019-12-10 21:12 ` Jens Axboe
@ 2019-12-10 21:28 ` Pavel Begunkov
2019-12-10 22:17 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2019-12-10 21:28 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: stable, 李通洲
Remembered my earlier comment: if the intention is to set HARDLINK,
must it go with REQ_F_LINK? And if not, is it ever valid to set
them both?
Not that there is a preference, but since this affects userspace,
it'd be better to state it clearly and have the code check it.
On 11/12/2019 00:12, Jens Axboe wrote:
> On 12/10/19 2:10 PM, Pavel Begunkov wrote:
>> Apart from the debug code (see comments below), the io_uring part of
>> the patchset looks good.
>
> Hah, oops!
>
>> Reviewed-by: Pavel Begunkov <[email protected]>
>
> Thanks
>
--
Pavel Begunkov
* Re: [PATCH 01/11] io_uring: allow unbreakable links
2019-12-10 21:28 ` Pavel Begunkov
@ 2019-12-10 22:17 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 22:17 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: stable, 李通洲
On 12/10/19 2:28 PM, Pavel Begunkov wrote:
> Remembered my earlier comment. If the intention is to set HARDLINK,
> must it go with REQ_F_LINK? And if not, is it ever valid to set
> them both?
IOSQE_IO_HARDLINK implies IOSQE_IO_LINK; it's the same behavior, it just
doesn't sever if we get a negative completion result.
> That's not like there is a preference, but as it affects userspace,
> it'd be better to state it clearly and make code to check it.
Agree!
--
Jens Axboe
* [PATCH 02/11] io-wq: remove worker->wait waitqueue
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
2019-12-10 15:57 ` [PATCH 01/11] io_uring: allow unbreakable links Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 03/11] io-wq: briefly spin for new work after finishing work Jens Axboe
` (8 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
We only have one case of using the waitqueue to wake the worker; the
rest use wake_up_process(). Since we can save some cycles by not
fiddling with the waitqueue in io_wqe_worker(), switch the work
activation to task wakeup and get rid of the now unused
wait_queue_head_t in struct io_worker.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io-wq.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 74b40506c5d9..6b663ddceb42 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -49,7 +49,6 @@ struct io_worker {
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
- wait_queue_head_t wait;
struct io_wqe *wqe;
struct io_wq_work *cur_work;
@@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
if (io_worker_get(worker)) {
- wake_up(&worker->wait);
+ wake_up_process(worker->task);
io_worker_release(worker);
return true;
}
@@ -497,13 +496,11 @@ static int io_wqe_worker(void *data)
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- DEFINE_WAIT(wait);
io_worker_start(wqe, worker);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
-
+ set_current_state(TASK_INTERRUPTIBLE);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
@@ -526,8 +523,6 @@ static int io_wqe_worker(void *data)
break;
}
- finish_wait(&worker->wait, &wait);
-
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
@@ -589,7 +584,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
- init_waitqueue_head(&worker->wait);
worker->wqe = wqe;
spin_lock_init(&worker->lock);
--
2.24.0
* [PATCH 03/11] io-wq: briefly spin for new work after finishing work
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
2019-12-10 15:57 ` [PATCH 01/11] io_uring: allow unbreakable links Jens Axboe
2019-12-10 15:57 ` [PATCH 02/11] io-wq: remove worker->wait waitqueue Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 04/11] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe
` (7 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
To avoid going to sleep only to get woken shortly thereafter, spin
briefly for new work upon completion of work.
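In other words, after finishing one work item the worker polls for a
short, bounded while before going back to sleep. A rough userspace
analogue of that pattern, purely as an illustrative sketch (this is not
the io-wq code; the struct, helper name, and iteration count are made
up):

#include <pthread.h>
#include <stdatomic.h>

struct fake_wq {
	atomic_int	pending;	/* number of queued work items */
	pthread_mutex_t	lock;
	pthread_cond_t	cond;
};

/*
 * Producers are assumed to bump ->pending and call pthread_cond_signal()
 * while holding ->lock.
 */
static void wait_for_work(struct fake_wq *wq)
{
	/* spin briefly first, hoping new work shows up right away */
	for (int i = 0; i < 1000; i++) {
		if (atomic_load(&wq->pending))
			return;		/* got work without sleeping */
	}

	/* nothing arrived quickly; sleep until a producer signals us */
	pthread_mutex_lock(&wq->lock);
	while (!atomic_load(&wq->pending))
		pthread_cond_wait(&wq->cond, &wq->lock);
	pthread_mutex_unlock(&wq->lock);
}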
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io-wq.c | 24 ++++++++++++++++++++++--
fs/io-wq.h | 7 ++++---
2 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 6b663ddceb42..90c4978781fb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -491,26 +491,46 @@ static void io_worker_handle_work(struct io_worker *worker)
} while (1);
}
+static inline void io_worker_spin_for_work(struct io_wqe *wqe)
+{
+ int i = 0;
+
+ while (++i < 1000) {
+ if (io_wqe_run_queue(wqe))
+ break;
+ if (need_resched())
+ break;
+ cpu_relax();
+ }
+}
+
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
+ bool did_work;
io_worker_start(wqe, worker);
+ did_work = false;
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
+loop:
+ if (did_work)
+ io_worker_spin_for_work(wqe);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
- continue;
+ did_work = true;
+ goto loop;
}
+ did_work = false;
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
- continue;
+ goto loop;
}
spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 7c333a28e2a7..fb993b2bd0ef 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
if (!list->first) {
- list->first = list->last = node;
+ list->last = node;
+ WRITE_ONCE(list->first, node);
} else {
list->last->next = node;
list->last = node;
@@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
struct io_wq_work_node *prev)
{
if (node == list->first)
- list->first = node->next;
+ WRITE_ONCE(list->first, node->next);
if (node == list->last)
list->last = prev;
if (prev)
@@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
-#define wq_list_empty(list) ((list)->first == NULL)
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
(list)->last = NULL; \
--
2.24.0
* [PATCH 04/11] io_uring: sqthread should grab ctx->uring_lock for submissions
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (2 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 03/11] io-wq: briefly spin for new work after finishing work Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 05/11] io_uring: deferred send/recvmsg should assign iov Jens Axboe
` (6 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
We use the mutex to guard against registered file updates, for instance.
Ensure we're safe in accessing that state against concurrent updates.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4cda61fe67da..1e1e80d48145 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3002,12 +3002,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
if (req->result == -EAGAIN)
return -EAGAIN;
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (req->in_async)
- mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (req->in_async)
- mutex_unlock(&ctx->uring_lock);
}
return 0;
@@ -3661,7 +3656,9 @@ static int io_sq_thread(void *data)
}
to_submit = min(to_submit, ctx->sq_entries);
+ mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ mutex_unlock(&ctx->uring_lock);
if (ret > 0)
inflight += ret;
}
--
2.24.0
* [PATCH 05/11] io_uring: deferred send/recvmsg should assign iov
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (3 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 04/11] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 06/11] io_uring: don't dynamically allocate poll data Jens Axboe
` (5 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Don't just assign it from the main call path; that can miss the case
where we're called from issue deferral.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1e1e80d48145..a0ed20e097d9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2026,6 +2026,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
#else
return 0;
@@ -2061,7 +2062,6 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
- io.msg.iov = io.msg.fast_iov;
ret = io_sendmsg_prep(req, &io);
if (ret)
goto out;
@@ -2104,6 +2104,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ io->msg.iov = io->msg.fast_iov;
return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
&io->msg.iov);
#else
@@ -2143,7 +2144,6 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
- io.msg.iov = io.msg.fast_iov;
ret = io_recvmsg_prep(req, &io);
if (ret)
goto out;
--
2.24.0
* [PATCH 06/11] io_uring: don't dynamically allocate poll data
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (4 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 05/11] io_uring: deferred send/recvmsg should assign iov Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 07/11] io_uring: use atomic_t for refcounts Jens Axboe
` (4 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
This essentially reverts commit e944475e6984. For high poll ops
workloads, like TAO, the dynamic allocation of the wait_queue
entry for IORING_OP_POLL_ADD adds considerable extra overhead.
Go back to embedding the wait_queue_entry, but keep the usage of
wait->private for the pointer stashing.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a0ed20e097d9..9a596b819334 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -293,7 +293,7 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry *wait;
+ struct wait_queue_entry wait;
};
struct io_timeout_data {
@@ -2286,8 +2286,8 @@ static void io_poll_remove_one(struct io_kiocb *req)
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait->entry)) {
- list_del_init(&poll->wait->entry);
+ if (!list_empty(&poll->wait.entry)) {
+ list_del_init(&poll->wait.entry);
io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
@@ -2358,7 +2358,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- kfree(req->poll.wait);
if (error)
io_cqring_fill_event(req, error);
else
@@ -2396,7 +2395,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
*/
spin_lock_irq(&ctx->completion_lock);
if (!mask && ret != -ECANCELED) {
- add_wait_queue(poll->head, poll->wait);
+ add_wait_queue(poll->head, &poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
@@ -2426,7 +2425,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
- list_del_init(&poll->wait->entry);
+ list_del_init(&poll->wait.entry);
/*
* Run completion inline if we can. We're using trylock here because
@@ -2467,7 +2466,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
pt->error = 0;
pt->req->poll.head = head;
- add_wait_queue(head, pt->req->poll.wait);
+ add_wait_queue(head, &pt->req->poll.wait);
}
static void io_poll_req_insert(struct io_kiocb *req)
@@ -2496,10 +2495,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->file)
return -EBADF;
- poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
- if (!poll->wait)
- return -ENOMEM;
-
req->io = NULL;
INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
@@ -2516,9 +2511,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait->entry);
- init_waitqueue_func_entry(poll->wait, io_poll_wake);
- poll->wait->private = poll;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ poll->wait.private = poll;
INIT_LIST_HEAD(&req->list);
@@ -2527,14 +2522,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait->entry))) {
+ if (unlikely(list_empty(&poll->wait.entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
- list_del_init(&poll->wait->entry);
+ list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
--
2.24.0
* [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (5 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 06/11] io_uring: don't dynamically allocate poll data Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 22:04 ` Jann Horn
2019-12-10 15:57 ` [PATCH 08/11] io_uring: run next sqe inline if possible Jens Axboe
` (3 subsequent siblings)
10 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Recently had a regression that turned out to be because
CONFIG_REFCOUNT_FULL was set. Our ref count usage is really simple,
so let's just use atomic_t and get rid of the dependency on the full
reference count checking being enabled or disabled.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9a596b819334..05419a152b32 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -360,7 +360,7 @@ struct io_kiocb {
};
struct list_head link_list;
unsigned int flags;
- refcount_t refs;
+ atomic_t refs;
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
@@ -770,7 +770,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
- refcount_inc(&req->refs);
+ atomic_inc(&req->refs);
req->result = res;
list_add_tail(&req->list, &ctx->cq_overflow_list);
}
@@ -852,7 +852,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->ctx = ctx;
req->flags = 0;
/* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
+ atomic_set(&req->refs, 2);
req->result = 0;
INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
@@ -1035,13 +1035,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
io_req_find_next(req, nxtptr);
- if (refcount_dec_and_test(&req->refs))
+ if (atomic_dec_and_test(&req->refs))
__io_free_req(req);
}
static void io_put_req(struct io_kiocb *req)
{
- if (refcount_dec_and_test(&req->refs))
+ if (atomic_dec_and_test(&req->refs))
io_free_req(req);
}
@@ -1052,14 +1052,14 @@ static void io_put_req(struct io_kiocb *req)
static void __io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
+ if (atomic_sub_and_test(2, &req->refs))
__io_free_req(req);
}
static void io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
+ if (atomic_sub_and_test(2, &req->refs))
io_free_req(req);
}
@@ -1108,7 +1108,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_cqring_fill_event(req, req->result);
(*nr_events)++;
- if (refcount_dec_and_test(&req->refs)) {
+ if (atomic_dec_and_test(&req->refs)) {
/* If we're not using fixed files, we have to pair the
* completion part with the file put. Use regular
* completions for those, only batch free for fixed
@@ -3169,7 +3169,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (!list_empty(&req->link_list)) {
prev = list_entry(req->link_list.prev, struct io_kiocb,
link_list);
- if (refcount_inc_not_zero(&prev->refs)) {
+ if (atomic_inc_not_zero(&prev->refs)) {
list_del_init(&req->link_list);
prev->flags &= ~REQ_F_LINK_TIMEOUT;
} else
@@ -4237,7 +4237,7 @@ static void io_get_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- refcount_inc(&req->refs);
+ atomic_inc(&req->refs);
}
static int io_sq_offload_start(struct io_ring_ctx *ctx,
@@ -4722,7 +4722,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (req->work.files != files)
continue;
/* req is being completed, ignore */
- if (!refcount_inc_not_zero(&req->refs))
+ if (!atomic_inc_not_zero(&req->refs))
continue;
cancel_req = req;
break;
--
2.24.0
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 15:57 ` [PATCH 07/11] io_uring: use atomic_t for refcounts Jens Axboe
@ 2019-12-10 22:04 ` Jann Horn
2019-12-10 22:21 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Jann Horn @ 2019-12-10 22:04 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, Will Deacon, Kees Cook, Kernel Hardening
[context preserved for additional CCs]
On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
> Recently had a regression that turned out to be because
> CONFIG_REFCOUNT_FULL was set.
I assume "regression" here refers to a performance regression? Do you
have more concrete numbers on this? Is one of the refcounting calls
particularly problematic compared to the others?
I really don't like it when raw atomic_t is used for refcounting
purposes - not only because that gets rid of the overflow checks, but
also because it is less clear semantically.
> Our ref count usage is really simple,
In my opinion, for a refcount to qualify as "really simple", it must
be possible to annotate each relevant struct member and local variable
with the (fixed) bias it carries when alive and non-NULL. This
refcount is more complicated than that.
> so let's just use atomic_t and get rid of the dependency on the full
> reference count checking being enabled or disabled.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> fs/io_uring.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 9a596b819334..05419a152b32 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -360,7 +360,7 @@ struct io_kiocb {
> };
> struct list_head link_list;
> unsigned int flags;
> - refcount_t refs;
> + atomic_t refs;
> #define REQ_F_NOWAIT 1 /* must not punt to workers */
> #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
> #define REQ_F_FIXED_FILE 4 /* ctx owns file */
> @@ -770,7 +770,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
> WRITE_ONCE(ctx->rings->cq_overflow,
> atomic_inc_return(&ctx->cached_cq_overflow));
> } else {
> - refcount_inc(&req->refs);
> + atomic_inc(&req->refs);
> req->result = res;
> list_add_tail(&req->list, &ctx->cq_overflow_list);
> }
> @@ -852,7 +852,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> req->ctx = ctx;
> req->flags = 0;
> /* one is dropped after submission, the other at completion */
> - refcount_set(&req->refs, 2);
> + atomic_set(&req->refs, 2);
> req->result = 0;
> INIT_IO_WORK(&req->work, io_wq_submit_work);
> return req;
> @@ -1035,13 +1035,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
> {
> io_req_find_next(req, nxtptr);
>
> - if (refcount_dec_and_test(&req->refs))
> + if (atomic_dec_and_test(&req->refs))
> __io_free_req(req);
> }
>
> static void io_put_req(struct io_kiocb *req)
> {
> - if (refcount_dec_and_test(&req->refs))
> + if (atomic_dec_and_test(&req->refs))
> io_free_req(req);
> }
>
> @@ -1052,14 +1052,14 @@ static void io_put_req(struct io_kiocb *req)
> static void __io_double_put_req(struct io_kiocb *req)
> {
> /* drop both submit and complete references */
> - if (refcount_sub_and_test(2, &req->refs))
> + if (atomic_sub_and_test(2, &req->refs))
> __io_free_req(req);
> }
>
> static void io_double_put_req(struct io_kiocb *req)
> {
> /* drop both submit and complete references */
> - if (refcount_sub_and_test(2, &req->refs))
> + if (atomic_sub_and_test(2, &req->refs))
> io_free_req(req);
> }
>
> @@ -1108,7 +1108,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
> io_cqring_fill_event(req, req->result);
> (*nr_events)++;
>
> - if (refcount_dec_and_test(&req->refs)) {
> + if (atomic_dec_and_test(&req->refs)) {
> /* If we're not using fixed files, we have to pair the
> * completion part with the file put. Use regular
> * completions for those, only batch free for fixed
> @@ -3169,7 +3169,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
> if (!list_empty(&req->link_list)) {
> prev = list_entry(req->link_list.prev, struct io_kiocb,
> link_list);
> - if (refcount_inc_not_zero(&prev->refs)) {
> + if (atomic_inc_not_zero(&prev->refs)) {
> list_del_init(&req->link_list);
> prev->flags &= ~REQ_F_LINK_TIMEOUT;
> } else
> @@ -4237,7 +4237,7 @@ static void io_get_work(struct io_wq_work *work)
> {
> struct io_kiocb *req = container_of(work, struct io_kiocb, work);
>
> - refcount_inc(&req->refs);
> + atomic_inc(&req->refs);
> }
>
> static int io_sq_offload_start(struct io_ring_ctx *ctx,
> @@ -4722,7 +4722,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
> if (req->work.files != files)
> continue;
> /* req is being completed, ignore */
> - if (!refcount_inc_not_zero(&req->refs))
> + if (!atomic_inc_not_zero(&req->refs))
> continue;
> cancel_req = req;
> break;
> --
> 2.24.0
>
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 22:04 ` Jann Horn
@ 2019-12-10 22:21 ` Jens Axboe
2019-12-10 22:46 ` Kees Cook
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 22:21 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring, Will Deacon, Kees Cook, Kernel Hardening
On 12/10/19 3:04 PM, Jann Horn wrote:
> [context preserved for additional CCs]
>
> On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
>> Recently had a regression that turned out to be because
>> CONFIG_REFCOUNT_FULL was set.
>
> I assume "regression" here refers to a performance regression? Do you
> have more concrete numbers on this? Is one of the refcounting calls
> particularly problematic compared to the others?
Yes, a performance regression. io_uring is using io-wq now, which does
an extra get/put on the work item to make it safe against async cancel.
That get/put translates into a refcount_inc and refcount_dec per work
item, and meant that we went from 0.5% refcount CPU in the test case to
1.5%. That's a pretty substantial increase.
> I really don't like it when raw atomic_t is used for refcounting
> purposes - not only because that gets rid of the overflow checks, but
> also because it is less clear semantically.
Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
that's what I should do. But I'd prefer to just drop the refcount on the
io_uring side and keep it on for other potential useful cases.
>> Our ref count usage is really simple,
>
> In my opinion, for a refcount to qualify as "really simple", it must
> be possible to annotate each relevant struct member and local variable
> with the (fixed) bias it carries when alive and non-NULL. This
> refcount is more complicated than that.
:-(
--
Jens Axboe
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 22:21 ` Jens Axboe
@ 2019-12-10 22:46 ` Kees Cook
2019-12-10 22:55 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Kees Cook @ 2019-12-10 22:46 UTC (permalink / raw)
To: Jens Axboe; +Cc: Jann Horn, io-uring, Will Deacon, Kernel Hardening
On Tue, Dec 10, 2019 at 03:21:04PM -0700, Jens Axboe wrote:
> On 12/10/19 3:04 PM, Jann Horn wrote:
> > [context preserved for additional CCs]
> >
> > On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
> >> Recently had a regression that turned out to be because
> >> CONFIG_REFCOUNT_FULL was set.
> >
> > I assume "regression" here refers to a performance regression? Do you
> > have more concrete numbers on this? Is one of the refcounting calls
> > particularly problematic compared to the others?
>
> Yes, a performance regression. io_uring is using io-wq now, which does
> an extra get/put on the work item to make it safe against async cancel.
> That get/put translates into a refcount_inc and refcount_dec per work
> item, and meant that we went from 0.5% refcount CPU in the test case to
> 1.5%. That's a pretty substantial increase.
>
> > I really don't like it when raw atomic_t is used for refcounting
> > purposes - not only because that gets rid of the overflow checks, but
> > also because it is less clear semantically.
>
> Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
> could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
> that's what I should do. But I'd prefer to just drop the refcount on the
> io_uring side and keep it on for other potential useful cases.
There is no CONFIG_REFCOUNT_FULL any more. Will Deacon's version came
out as nearly identical to the x86 asm version. Can you share the
workload where you saw this? We really don't want to regress refcount
protections, especially in the face of new APIs.
Will, do you have a moment to dig into this?
-Kees
>
> >> Our ref count usage is really simple,
> >
> > In my opinion, for a refcount to qualify as "really simple", it must
> > be possible to annotate each relevant struct member and local variable
> > with the (fixed) bias it carries when alive and non-NULL. This
> > refcount is more complicated than that.
>
> :-(
>
> --
> Jens Axboe
>
--
Kees Cook
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 22:46 ` Kees Cook
@ 2019-12-10 22:55 ` Jens Axboe
2019-12-11 10:20 ` Will Deacon
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 22:55 UTC (permalink / raw)
To: Kees Cook; +Cc: Jann Horn, io-uring, Will Deacon, Kernel Hardening
On 12/10/19 3:46 PM, Kees Cook wrote:
> On Tue, Dec 10, 2019 at 03:21:04PM -0700, Jens Axboe wrote:
>> On 12/10/19 3:04 PM, Jann Horn wrote:
>>> [context preserved for additional CCs]
>>>
>>> On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
>>>> Recently had a regression that turned out to be because
>>>> CONFIG_REFCOUNT_FULL was set.
>>>
>>> I assume "regression" here refers to a performance regression? Do you
>>> have more concrete numbers on this? Is one of the refcounting calls
>>> particularly problematic compared to the others?
>>
>> Yes, a performance regression. io_uring is using io-wq now, which does
>> an extra get/put on the work item to make it safe against async cancel.
>> That get/put translates into a refcount_inc and refcount_dec per work
>> item, and meant that we went from 0.5% refcount CPU in the test case to
>> 1.5%. That's a pretty substantial increase.
>>
>>> I really don't like it when raw atomic_t is used for refcounting
>>> purposes - not only because that gets rid of the overflow checks, but
>>> also because it is less clear semantically.
>>
>> Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
>> could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
>> that's what I should do. But I'd prefer to just drop the refcount on the
>> io_uring side and keep it on for other potential useful cases.
>
> There is no CONFIG_REFCOUNT_FULL any more. Will Deacon's version came
> out as nearly identical to the x86 asm version. Can you share the
> workload where you saw this? We really don't want to regress refcount
> protections, especially in the face of new APIs.
>
> Will, do you have a moment to dig into this?
Ah, hopefully it'll work out ok, then. The patch came from testing the
full backport on 5.2.
Do you have a link to the "nearly identical"? I can backport that
patch and try on 5.2.
--
Jens Axboe
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-10 22:55 ` Jens Axboe
@ 2019-12-11 10:20 ` Will Deacon
2019-12-11 16:56 ` Kees Cook
0 siblings, 1 reply; 25+ messages in thread
From: Will Deacon @ 2019-12-11 10:20 UTC (permalink / raw)
To: Jens Axboe; +Cc: Kees Cook, Jann Horn, io-uring, Kernel Hardening
On Tue, Dec 10, 2019 at 03:55:05PM -0700, Jens Axboe wrote:
> On 12/10/19 3:46 PM, Kees Cook wrote:
> > On Tue, Dec 10, 2019 at 03:21:04PM -0700, Jens Axboe wrote:
> >> On 12/10/19 3:04 PM, Jann Horn wrote:
> >>> [context preserved for additional CCs]
> >>>
> >>> On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
> >>>> Recently had a regression that turned out to be because
> >>>> CONFIG_REFCOUNT_FULL was set.
> >>>
> >>> I assume "regression" here refers to a performance regression? Do you
> >>> have more concrete numbers on this? Is one of the refcounting calls
> >>> particularly problematic compared to the others?
> >>
> >> Yes, a performance regression. io_uring is using io-wq now, which does
> >> an extra get/put on the work item to make it safe against async cancel.
> >> That get/put translates into a refcount_inc and refcount_dec per work
> >> item, and meant that we went from 0.5% refcount CPU in the test case to
> >> 1.5%. That's a pretty substantial increase.
> >>
> >>> I really don't like it when raw atomic_t is used for refcounting
> >>> purposes - not only because that gets rid of the overflow checks, but
> >>> also because it is less clear semantically.
> >>
> >> Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
> >> could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
> >> that's what I should do. But I'd prefer to just drop the refcount on the
> >> io_uring side and keep it on for other potential useful cases.
> >
> > There is no CONFIG_REFCOUNT_FULL any more. Will Deacon's version came
> > out as nearly identical to the x86 asm version. Can you share the
> > > workload where you saw this? We really don't want to regress refcount
> > protections, especially in the face of new APIs.
> >
> > Will, do you have a moment to dig into this?
>
> Ah, hopefully it'll work out ok, then. The patch came from testing the
> full backport on 5.2.
>
> Do you have a link to the "nearly identical"? I can backport that
> patch and try on 5.2.
You could try my refcount/full branch, which is what ended up getting merged
during the merge window:
https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=refcount/full
Will
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-11 10:20 ` Will Deacon
@ 2019-12-11 16:56 ` Kees Cook
2019-12-11 17:00 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Kees Cook @ 2019-12-11 16:56 UTC (permalink / raw)
To: Will Deacon; +Cc: Jens Axboe, Jann Horn, io-uring, Kernel Hardening
On Wed, Dec 11, 2019 at 10:20:13AM +0000, Will Deacon wrote:
> On Tue, Dec 10, 2019 at 03:55:05PM -0700, Jens Axboe wrote:
> > On 12/10/19 3:46 PM, Kees Cook wrote:
> > > On Tue, Dec 10, 2019 at 03:21:04PM -0700, Jens Axboe wrote:
> > >> On 12/10/19 3:04 PM, Jann Horn wrote:
> > >>> [context preserved for additional CCs]
> > >>>
> > >>> On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
> > >>>> Recently had a regression that turned out to be because
> > >>>> CONFIG_REFCOUNT_FULL was set.
> > >>>
> > >>> I assume "regression" here refers to a performance regression? Do you
> > >>> have more concrete numbers on this? Is one of the refcounting calls
> > >>> particularly problematic compared to the others?
> > >>
> > >> Yes, a performance regression. io_uring is using io-wq now, which does
> > >> an extra get/put on the work item to make it safe against async cancel.
> > >> That get/put translates into a refcount_inc and refcount_dec per work
> > >> item, and meant that we went from 0.5% refcount CPU in the test case to
> > >> 1.5%. That's a pretty substantial increase.
> > >>
> > >>> I really don't like it when raw atomic_t is used for refcounting
> > >>> purposes - not only because that gets rid of the overflow checks, but
> > >>> also because it is less clear semantically.
> > >>
> > >> Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
> > >> could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
> > >> that's what I should do. But I'd prefer to just drop the refcount on the
> > >> io_uring side and keep it on for other potentially useful cases.
> > >
> > > There is no CONFIG_REFCOUNT_FULL any more. Will Deacon's version came
> > > out as nearly identical to the x86 asm version. Can you share the
> > > workload where you saw this? We really don't want to regress refcount
> > > protections, especially in the face of new APIs.
> > >
> > > Will, do you have a moment to dig into this?
> >
> > Ah, hopefully it'll work out ok, then. The patch came from testing the
> > full backport on 5.2.
Oh good! I thought we had some kind of impossible workload. :)
> > Do you have a link to the "nearly identical"? I can backport that
> > patch and try on 5.2.
>
> You could try my refcount/full branch, which is what ended up getting merged
> during the merge window:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=refcount/full
Yeah, as you can see in the measured tight-loop timings in
https://git.kernel.org/linus/dcb786493f3e48da3272b710028d42ec608cfda1
there was a 0.1% difference for Will's series compared to the x86 assembly
version, whereas the old FULL was almost 70%.
--
Kees Cook
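For a rough idea of the kind of tight-loop measurement behind those numbers,
a minimal module-style sketch (an assumption, not the harness from the commit
above; the module name and iteration count are made up):

#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/refcount.h>

static int __init refbench_init(void)
{
        refcount_t r;
        ktime_t t0, t1;
        long i;

        refcount_set(&r, 1);
        t0 = ktime_get();
        for (i = 0; i < 100000000L; i++) {
                refcount_inc(&r);       /* checked fast path under test */
                refcount_dec(&r);
        }
        t1 = ktime_get();
        pr_info("refbench: %lld ns\n", ktime_to_ns(ktime_sub(t1, t0)));
        return 0;
}

static void __exit refbench_exit(void)
{
}

module_init(refbench_init);
module_exit(refbench_exit);
MODULE_LICENSE("GPL");

Swapping the refcount_* calls for atomic_inc()/atomic_dec() gives the
unchecked baseline that such percentages are measured against.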
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 07/11] io_uring: use atomic_t for refcounts
2019-12-11 16:56 ` Kees Cook
@ 2019-12-11 17:00 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-11 17:00 UTC (permalink / raw)
To: Kees Cook, Will Deacon; +Cc: Jann Horn, io-uring, Kernel Hardening
On 12/11/19 9:56 AM, Kees Cook wrote:
> On Wed, Dec 11, 2019 at 10:20:13AM +0000, Will Deacon wrote:
>> On Tue, Dec 10, 2019 at 03:55:05PM -0700, Jens Axboe wrote:
>>> On 12/10/19 3:46 PM, Kees Cook wrote:
>>>> On Tue, Dec 10, 2019 at 03:21:04PM -0700, Jens Axboe wrote:
>>>>> On 12/10/19 3:04 PM, Jann Horn wrote:
>>>>>> [context preserved for additional CCs]
>>>>>>
>>>>>> On Tue, Dec 10, 2019 at 4:57 PM Jens Axboe <[email protected]> wrote:
>>>>>>> Recently had a regression that turned out to be because
>>>>>>> CONFIG_REFCOUNT_FULL was set.
>>>>>>
>>>>>> I assume "regression" here refers to a performance regression? Do you
>>>>>> have more concrete numbers on this? Is one of the refcounting calls
>>>>>> particularly problematic compared to the others?
>>>>>
>>>>> Yes, a performance regression. io_uring is using io-wq now, which does
>>>>> an extra get/put on the work item to make it safe against async cancel.
>>>>> That get/put translates into a refcount_inc and refcount_dec per work
>>>>> item, and meant that we went from 0.5% refcount CPU in the test case to
>>>>> 1.5%. That's a pretty substantial increase.
>>>>>
>>>>>> I really don't like it when raw atomic_t is used for refcounting
>>>>>> purposes - not only because that gets rid of the overflow checks, but
>>>>>> also because it is less clear semantically.
>>>>>
>>>>> Not a huge fan either, but... It's hard to give up 1% of extra CPU. You
>>>>> could argue I could just turn off REFCOUNT_FULL, and I could. Maybe
>>>>> that's what I should do. But I'd prefer to just drop the refcount on the
>>>>> io_uring side and keep it on for other potentially useful cases.
>>>>
>>>> There is no CONFIG_REFCOUNT_FULL any more. Will Deacon's version came
>>>> out as nearly identical to the x86 asm version. Can you share the
>>>> workload where you saw this? We really don't want to regress refcount
>>>> protections, especially in the face of new APIs.
>>>>
>>>> Will, do you have a moment to dig into this?
>>>
>>> Ah, hopefully it'll work out ok, then. The patch came from testing the
>>> full backport on 5.2.
>
> Oh good! I thought we had some kind of impossible workload. :)
>
>>> Do you have a link to the "nearly identical"? I can backport that
>>> patch and try on 5.2.
>>
>> You could try my refcount/full branch, which is what ended up getting merged
>> during the merge window:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=refcount/full
>
> Yeah, as you can see in the measured tight-loop timings in
> https://git.kernel.org/linus/dcb786493f3e48da3272b710028d42ec608cfda1
> there was a 0.1% difference for Will's series compared to the x86 assembly
> version, whereas the old FULL was almost 70%.
That looks very promising! Hopefully the patch is moot at that point; I
dropped it from the series yesterday in any case. I'll revisit as soon
as I can and holler if there's an issue.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 08/11] io_uring: run next sqe inline if possible
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (6 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 07/11] io_uring: use atomic_t for refcounts Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 09/11] io_uring: only hash regular files for async work execution Jens Axboe
` (2 subsequent siblings)
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
One major use case of linked commands is the ability to run the next
link inline, if at all possible. This is done correctly for async
offload, but somewhere along the line we lost the ability to do so when
we were able to complete a request without having to punt it. Ensure
that we do so correctly.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 05419a152b32..4fd65ff230eb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3230,13 +3230,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
int ret;
+again:
+ linked_timeout = io_prep_linked_timeout(req);
+
ret = io_issue_sqe(req, &nxt, true);
- if (nxt)
- io_queue_async_work(nxt);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -3255,7 +3256,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
- return;
+ goto done_req;
}
err:
@@ -3275,6 +3276,12 @@ static void __io_queue_sqe(struct io_kiocb *req)
req_set_fail_links(req);
io_put_req(req);
}
+done_req:
+ if (nxt) {
+ req = nxt;
+ nxt = NULL;
+ goto again;
+ }
}
static void io_queue_sqe(struct io_kiocb *req)
--
2.24.0
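For readers following the control flow, a simplified sketch of what the hunks
above move to; the names here (chain_req, issue_req, punt_to_async,
finish_req) are made up for illustration and are not the functions in the
diff:

#include <linux/errno.h>

struct chain_req {
        struct chain_req *next;
};

int issue_req(struct chain_req *req, struct chain_req **nxt);
void punt_to_async(struct chain_req *req);
void finish_req(struct chain_req *req);

static void queue_chain(struct chain_req *req)
{
        struct chain_req *nxt;

again:
        nxt = NULL;
        if (issue_req(req, &nxt) == -EAGAIN) {
                /* can't make progress without blocking, hand off to a worker */
                punt_to_async(req);
        } else {
                finish_req(req);
        }
        if (nxt) {
                /* run the next link in the chain inline instead of punting */
                req = nxt;
                goto again;
        }
}

The key change is the final if block: previously the next link was queued for
async execution unconditionally; now it loops back and is issued inline.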
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 09/11] io_uring: only hash regular files for async work execution
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (7 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 08/11] io_uring: run next sqe inline if possible Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 15:57 ` [PATCH 10/11] net: make socket read/write_iter() honor IOCB_NOWAIT Jens Axboe
2019-12-10 15:57 ` [PATCH 11/11] io_uring: add sockets to list of files that support non-blocking issue Jens Axboe
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
We hash regular files to avoid having multiple threads hammer on the
inode mutex, but it should not be needed on other types of files
(like sockets).
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4fd65ff230eb..8258a2f299e3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -581,7 +581,9 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- do_hashed = true;
+ /* only regular files should be hashed for writes */
+ if (req->flags & REQ_F_ISREG)
+ do_hashed = true;
/* fall-through */
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
--
2.24.0
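The rationale as a hedged one-liner (not the io-wq internals; the helper name
is made up): buffered writes to the same regular file serialize on the inode
lock anyway, so hashing them onto a single worker keeps the other workers
from piling up on that lock, while sockets take no such lock and can fan out
freely.

#include <linux/fs.h>

static bool io_should_hash_write(struct file *file)
{
        return S_ISREG(file_inode(file)->i_mode);
}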
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 10/11] net: make socket read/write_iter() honor IOCB_NOWAIT
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (8 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 09/11] io_uring: only hash regular files for async work execution Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
2019-12-10 19:37 ` David Miller
2019-12-10 15:57 ` [PATCH 11/11] io_uring: add sockets to list of files that support non-blocking issue Jens Axboe
10 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, David Miller, netdev
The socket read/write helpers only look at the file's O_NONBLOCK flag,
not the iocb's IOCB_NOWAIT flag. This breaks users like preadv2/pwritev2
and io_uring that rely on marking the iocb nonblocking rather than the
file itself.
Cc: David Miller <[email protected]>
Cc: [email protected]
Signed-off-by: Jens Axboe <[email protected]>
---
net/socket.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/socket.c b/net/socket.c
index b343db1489bd..b116e58d6438 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -957,7 +957,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
.msg_iocb = iocb};
ssize_t res;
- if (file->f_flags & O_NONBLOCK)
+ if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT))
msg.msg_flags = MSG_DONTWAIT;
if (iocb->ki_pos != 0)
@@ -982,7 +982,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos != 0)
return -ESPIPE;
- if (file->f_flags & O_NONBLOCK)
+ if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT))
msg.msg_flags = MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
--
2.24.0
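A small userspace illustration of the case the hunks above start honoring
(not part of the patch): preadv2() with RWF_NOWAIT asks for IOCB_NOWAIT on
the kiocb without the socket ever being marked O_NONBLOCK. On a kernel that
accepts RWF_NOWAIT for sockets, the read below returns -1/EAGAIN immediately
instead of blocking; some kernels may instead reject the flag with EOPNOTSUPP
before it ever reaches the socket code.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
        int sv[2];
        char buf[64];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        ssize_t ret;

        socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
        /* nothing has been written to sv[1], so a blocking read would hang */
        ret = preadv2(sv[0], &iov, 1, -1, RWF_NOWAIT);
        printf("preadv2: %zd (%s)\n", ret, ret < 0 ? strerror(errno) : "ok");
        return 0;
}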
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 11/11] io_uring: add sockets to list of files that support non-blocking issue
2019-12-10 15:57 [PATCHSET 0/11] io_uring improvements/fixes for 5.5-rc Jens Axboe
` (9 preceding siblings ...)
2019-12-10 15:57 ` [PATCH 10/11] net: make socket read/write_iter() honor IOCB_NOWAIT Jens Axboe
@ 2019-12-10 15:57 ` Jens Axboe
10 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2019-12-10 15:57 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
While chasing a performance issue between using IORING_OP_RECVMSG and
IORING_OP_READV on sockets, tracing showed that we always punt the
socket reads to async offload. This is due to io_file_supports_async()
not checking for S_ISSOCK on the inode. Since sockets support the
O_NONBLOCK (or MSG_DONTWAIT) flag just fine, add sockets to the list
of file types that we can do a non-blocking issue to.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8258a2f299e3..80e02957ae9e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1431,7 +1431,7 @@ static bool io_file_supports_async(struct file *file)
{
umode_t mode = file_inode(file)->i_mode;
- if (S_ISBLK(mode) || S_ISCHR(mode))
+ if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
return true;
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
@@ -1867,7 +1867,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
goto copy_iov;
}
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
goto copy_iov;
iov_count = iov_iter_count(&iter);
--
2.24.0
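For completeness, a liburing-style sketch of the scenario (liburing assumed
available; not part of the patch): a READV on a socket that already has data
queued can now be issued non-blocking inline rather than always being punted
to an io-wq worker. The completion looks the same either way; the difference
is where the work runs, which is what the tracing mentioned above showed.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <liburing.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int sv[2];
        char buf[64];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

        socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
        write(sv[1], "ping", 4);        /* data is ready, no need to block */

        io_uring_queue_init(8, &ring, 0);
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_readv(sqe, sv[0], &iov, 1, 0);
        io_uring_submit(&ring);
        io_uring_wait_cqe(&ring, &cqe);
        printf("readv res=%d\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
}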
^ permalink raw reply related [flat|nested] 25+ messages in thread