* [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-07 23:21 UTC
To: io-uring
I'd like some feedback on this one. Even with the overflow backpressure
patch, we still have a potentially large gap where applications can
submit IO before we get any dropped events in the CQ ring. This is
especially true if the execution time of those requests is long
(unbounded).
This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
have more IO pending than we can feasibly support. This is normally the
CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
the CQ ring size.
This helps manage the pending queue size instead of letting it grow
indefinitely.
Note that we could potentially just make this the default behavior -
applications need to handle -EAGAIN returns already, in case we run out
of memory, and if we change this to return -EAGAIN as well, then it
doesn't introduce any new failure cases. I'm tempted to do that...
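As a rough illustration (a minimal sketch assuming liburing; the -EBUSY
vs -EAGAIN choice follows this RFC and is not settled ABI), an
application hitting the inflight cap could simply reap completions and
retry:

#include <errno.h>
#include <liburing.h>

/*
 * Hedged sketch: if submission is refused because too much IO is already
 * inflight (-EBUSY here, possibly -EAGAIN as discussed above), drain
 * completions until the submit goes through again.
 */
static int submit_with_backoff(struct io_uring *ring)
{
	int ret;

	while ((ret = io_uring_submit(ring)) == -EBUSY || ret == -EAGAIN) {
		struct io_uring_cqe *cqe;

		/* reap at least one completion to shrink the inflight count */
		if (io_uring_wait_cqe(ring, &cqe))
			break;
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}

The same loop works whichever errno we settle on, which is part of why
folding this into the existing -EAGAIN handling is tempting.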
Anyway, comments solicited!
Not-yet-signed-off-by
---
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8344f95817e..db8b7e06f36d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -203,6 +203,7 @@ struct io_ring_ctx {
unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
+ atomic_t cached_cq_overflow;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
@@ -221,13 +222,12 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
- atomic_t cached_cq_overflow;
unsigned cq_entries;
unsigned cq_mask;
+ atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
- atomic_t cq_timeouts;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
@@ -705,23 +705,53 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
io_cqring_ev_posted(ctx);
}
+static bool io_req_over_limit(struct io_ring_ctx *ctx)
+{
+ unsigned limit, inflight;
+
+ if (!(ctx->flags & IORING_SETUP_INFLIGHT))
+ return false;
+ /* only do checks every once in a while */
+ if (ctx->cached_sq_head & ctx->sq_mask)
+ return false;
+
+ if (ctx->flags & IORING_SETUP_CQ_NODROP)
+ limit = 2 * ctx->cq_entries;
+ else
+ limit = ctx->cq_entries;
+
+ inflight = ctx->cached_sq_head -
+ (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
+ return inflight >= limit;
+}
+
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+ struct io_submit_state *state, bool force)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
if (!percpu_ref_tryget(&ctx->refs))
- return NULL;
+ return ERR_PTR(-ENXIO);
if (!state) {
+ if (!force && io_req_over_limit(ctx)) {
+ req = ERR_PTR(-EBUSY);
+ goto out;
+ }
req = kmem_cache_alloc(req_cachep, gfp);
- if (unlikely(!req))
+ if (unlikely(!req)) {
+ req = ERR_PTR(-EAGAIN);
goto out;
+ }
} else if (!state->free_reqs) {
size_t sz;
int ret;
+ if (!force && io_req_over_limit(ctx)) {
+ req = ERR_PTR(-EBUSY);
+ goto out;
+ }
sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
@@ -731,8 +761,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
*/
if (unlikely(ret <= 0)) {
state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
+ if (!state->reqs[0]) {
+ req = ERR_PTR(-EAGAIN);
goto out;
+ }
ret = 1;
}
state->free_reqs = ret - 1;
@@ -754,7 +786,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
return req;
out:
percpu_ref_put(&ctx->refs);
- return NULL;
+ return req;
}
static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
@@ -2963,10 +2995,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
unsigned int sqe_flags;
- req = io_get_req(ctx, statep);
- if (unlikely(!req)) {
+ req = io_get_req(ctx, statep, false);
+ if (unlikely(IS_ERR(req))) {
if (!submitted)
- submitted = -EAGAIN;
+ submitted = PTR_ERR(req);
+ req = NULL;
break;
}
if (!io_get_sqring(ctx, &req->submit)) {
@@ -2986,9 +3019,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
+ shadow_req = io_get_req(ctx, NULL, true);
+ if (unlikely(IS_ERR(shadow_req))) {
+ shadow_req = NULL;
goto out;
+ }
shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
refcount_dec(&shadow_req->refs);
}
@@ -4501,7 +4536,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CQ_NODROP))
+ IORING_SETUP_CQ_NODROP | IORING_SETUP_INFLIGHT))
return -EINVAL;
ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 3d8517eb376e..e7d8e16f9e22 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -57,6 +57,7 @@ struct io_uring_sqe {
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CQ_NODROP (1U << 4) /* no CQ drops */
+#define IORING_SETUP_INFLIGHT (1U << 5) /* reject IO over limit */
#define IORING_OP_NOP 0
#define IORING_OP_READV 1
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-08 0:19 UTC
To: io-uring
On 11/7/19 4:21 PM, Jens Axboe wrote:
> I'd like some feedback on this one. Even with the overflow backpressure
> patch, we still have a potentially large gap where applications can
> submit IO before we get any dropped events in the CQ ring. This is
> especially true if the execution time of those requests is long
> (unbounded).
>
> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
> have more IO pending than we can feasibly support. This is normally the
> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
> the CQ ring size.
>
> This helps manage the pending queue size instead of letting it grow
> indefinitely.
>
> Note that we could potentially just make this the default behavior -
> applications need to handle -EAGAIN returns already, in case we run out
> of memory, and if we change this to return -EAGAIN as well, then it
> doesn't introduce any new failure cases. I'm tempted to do that...
>
> Anyway, comments solicited!
After a little deliberation, I think we should go with the one that
doesn't require users to opt-in. As mentioned, let's change it to
-EAGAIN to not introduce a new errno for this. They essentially mean
the same thing anyway.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8344f95817e..4c488bf6e889 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -203,6 +203,7 @@ struct io_ring_ctx {
unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
+ atomic_t cached_cq_overflow;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
@@ -221,13 +222,12 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
- atomic_t cached_cq_overflow;
unsigned cq_entries;
unsigned cq_mask;
+ atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
- atomic_t cq_timeouts;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
@@ -705,16 +705,39 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
io_cqring_ev_posted(ctx);
}
+static bool io_req_over_limit(struct io_ring_ctx *ctx)
+{
+ unsigned limit, inflight;
+
+ /*
+ * This doesn't need to be super precise, so only check every once
+ * in a while.
+ */
+ if (ctx->cached_sq_head & ctx->sq_mask)
+ return false;
+
+ if (ctx->flags & IORING_SETUP_CQ_NODROP)
+ limit = 2 * ctx->cq_entries;
+ else
+ limit = ctx->cq_entries;
+
+ inflight = ctx->cached_sq_head -
+ (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
+ return inflight >= limit;
+}
+
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+ struct io_submit_state *state, bool force)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
if (!percpu_ref_tryget(&ctx->refs))
- return NULL;
+ return ERR_PTR(-ENXIO);
if (!state) {
+ if (unlikely(!force && io_req_over_limit(ctx)))
+ goto out;
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
goto out;
@@ -722,6 +745,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
size_t sz;
int ret;
+ if (unlikely(!force && io_req_over_limit(ctx)))
+ goto out;
sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
@@ -754,7 +779,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
return req;
out:
percpu_ref_put(&ctx->refs);
- return NULL;
+ return ERR_PTR(-EAGAIN);
}
static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
@@ -2963,10 +2988,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
unsigned int sqe_flags;
- req = io_get_req(ctx, statep);
- if (unlikely(!req)) {
+ req = io_get_req(ctx, statep, false);
+ if (unlikely(IS_ERR(req))) {
if (!submitted)
- submitted = -EAGAIN;
+ submitted = PTR_ERR(req);
+ req = NULL;
break;
}
if (!io_get_sqring(ctx, &req->submit)) {
@@ -2986,9 +3012,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
+ shadow_req = io_get_req(ctx, NULL, true);
+ if (unlikely(IS_ERR(shadow_req))) {
+ shadow_req = NULL;
goto out;
+ }
shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
refcount_dec(&shadow_req->refs);
}
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Pavel Begunkov @ 2019-11-08 9:56 UTC
To: Jens Axboe, io-uring
On 08/11/2019 03:19, Jens Axboe wrote:
> On 11/7/19 4:21 PM, Jens Axboe wrote:
>> I'd like some feedback on this one. Even with the overflow backpressure
>> patch, we still have a potentially large gap where applications can
>> submit IO before we get any dropped events in the CQ ring. This is
>> especially true if the execution time of those requests is long
>> (unbounded).
>>
>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>> have more IO pending than we can feasibly support. This is normally the
>> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
>> the CQ ring size.
>>
>> This helps manage the pending queue size instead of letting it grow
>> indefinitely.
>>
>> Note that we could potentially just make this the default behavior -
>> applications need to handle -EAGAIN returns already, in case we run out
>> of memory, and if we change this to return -EAGAIN as well, then it
>> doesn't introduce any new failure cases. I'm tempted to do that...
>>
>> Anyway, comments solicited!
What's wrong with giving away overflow handling to the userspace? It
knows its inflight count, and it shouldn't be a problem to allocate
large enough rings. The same goes for the backpressure patch. Do you
have a particular requirement/user for this?
While something could be done {efficiently,securely,etc} in the
userspace, I would prefer to keep the kernel part simpler.
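As a concrete sketch of the userspace-side approach described above
(hedged: the liburing calls are assumed, MAX_INFLIGHT is a made-up
application constant, and IORING_SETUP_CQSIZE comes from the series
this builds on), the application sizes the CQ ring for its worst case
and caps inflight itself:

#include <liburing.h>

#define MAX_INFLIGHT	4096	/* application-chosen worst case */

static struct io_uring ring;
static unsigned int inflight;

static int setup_ring(void)
{
	struct io_uring_params p = { };

	/* CQ ring sized for the worst case; the SQ ring can stay small */
	p.flags = IORING_SETUP_CQSIZE;
	p.cq_entries = MAX_INFLIGHT;
	return io_uring_queue_init_params(128, &ring, &p);
}

static struct io_uring_sqe *get_sqe_capped(void)
{
	struct io_uring_sqe *sqe;

	if (inflight >= MAX_INFLIGHT)
		return NULL;	/* application-level backpressure */
	sqe = io_uring_get_sqe(&ring);
	if (sqe)
		inflight++;
	return sqe;
}

static void reap_cqe(struct io_uring_cqe *cqe)
{
	io_uring_cqe_seen(&ring, cqe);
	inflight--;
}

The tradeoff Jens points out in the reply is that this burns memory for
the worst case even when the common case is much smaller.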
>
> After a little deliberation, I think we should go with the one that
> doesn't require users to opt-in. As mentioned, let's change it to
> -EAGAIN to not introduce a new errno for this. They essentially mean
> the same thing anyway.
>
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f8344f95817e..4c488bf6e889 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -203,6 +203,7 @@ struct io_ring_ctx {
> unsigned sq_mask;
> unsigned sq_thread_idle;
> unsigned cached_sq_dropped;
> + atomic_t cached_cq_overflow;
> struct io_uring_sqe *sq_sqes;
>
> struct list_head defer_list;
> @@ -221,13 +222,12 @@ struct io_ring_ctx {
>
> struct {
> unsigned cached_cq_tail;
> - atomic_t cached_cq_overflow;
> unsigned cq_entries;
> unsigned cq_mask;
> + atomic_t cq_timeouts;
> struct wait_queue_head cq_wait;
> struct fasync_struct *cq_fasync;
> struct eventfd_ctx *cq_ev_fd;
> - atomic_t cq_timeouts;
> } ____cacheline_aligned_in_smp;
>
> struct io_rings *rings;
> @@ -705,16 +705,39 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
> io_cqring_ev_posted(ctx);
> }
>
> +static bool io_req_over_limit(struct io_ring_ctx *ctx)
> +{
> + unsigned limit, inflight;
> +
> + /*
> + * This doesn't need to be super precise, so only check every once
> + * in a while.
> + */
> + if (ctx->cached_sq_head & ctx->sq_mask)
> + return false;
> +
> + if (ctx->flags & IORING_SETUP_CQ_NODROP)
> + limit = 2 * ctx->cq_entries;
> + else
> + limit = ctx->cq_entries;
> +
> + inflight = ctx->cached_sq_head -
> + (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
> + return inflight >= limit;
> +}
> +
> static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> - struct io_submit_state *state)
> + struct io_submit_state *state, bool force)
> {
> gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
> struct io_kiocb *req;
>
> if (!percpu_ref_tryget(&ctx->refs))
> - return NULL;
> + return ERR_PTR(-ENXIO);
>
> if (!state) {
> + if (unlikely(!force && io_req_over_limit(ctx)))
> + goto out;
> req = kmem_cache_alloc(req_cachep, gfp);
> if (unlikely(!req))
> goto out;
> @@ -722,6 +745,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> size_t sz;
> int ret;
>
> + if (unlikely(!force && io_req_over_limit(ctx)))
> + goto out;
> sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
> ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
>
> @@ -754,7 +779,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> return req;
> out:
> percpu_ref_put(&ctx->refs);
> - return NULL;
> + return ERR_PTR(-EAGAIN);
> }
>
> static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
> @@ -2963,10 +2988,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
> struct io_kiocb *req;
> unsigned int sqe_flags;
>
> - req = io_get_req(ctx, statep);
> - if (unlikely(!req)) {
> + req = io_get_req(ctx, statep, false);
> + if (unlikely(IS_ERR(req))) {
> if (!submitted)
> - submitted = -EAGAIN;
> + submitted = PTR_ERR(req);
> + req = NULL;
> break;
> }
> if (!io_get_sqring(ctx, &req->submit)) {
> @@ -2986,9 +3012,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>
> if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
> if (!shadow_req) {
> - shadow_req = io_get_req(ctx, NULL);
> - if (unlikely(!shadow_req))
> + shadow_req = io_get_req(ctx, NULL, true);
> + if (unlikely(IS_ERR(shadow_req))) {
> + shadow_req = NULL;
> goto out;
> + }
> shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
> refcount_dec(&shadow_req->refs);
> }
>
--
Pavel Begunkov
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-08 14:05 UTC
To: Pavel Begunkov, io-uring
On 11/8/19 2:56 AM, Pavel Begunkov wrote:
> On 08/11/2019 03:19, Jens Axboe wrote:
>> On 11/7/19 4:21 PM, Jens Axboe wrote:
>>> I'd like some feedback on this one. Even with the overflow backpressure
>>> patch, we still have a potentially large gap where applications can
>>> submit IO before we get any dropped events in the CQ ring. This is
>>> especially true if the execution time of those requests is long
>>> (unbounded).
>>>
>>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>>> have more IO pending than we can feasibly support. This is normally the
>>> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
>>> the CQ ring size.
>>>
>>> This helps manage the pending queue size instead of letting it grow
>>> indefinitely.
>>>
>>> Note that we could potentially just make this the default behavior -
>>> applications need to handle -EAGAIN returns already, in case we run out
>>> of memory, and if we change this to return -EAGAIN as well, then it
>>> doesn't introduce any new failure cases. I'm tempted to do that...
>>>
>>> Anyway, comments solicited!
> What's wrong with giving away overflow handling to the userspace? It
> knows its inflight count, and it shouldn't be a problem to allocate
> large enough rings. The same goes for the backpressure patch. Do you
> have a particular requirement/user for this?
There are basically two things at play here:
- The backpressure patch helps with users that can't easily size the
ring. This could be library use cases, or just normal use cases that
don't necessarily know how big the ring needs to be. Hence they need
to size for the worst case, which is wasteful.
- For this case, it's different. I just want to avoid having the
application having potential tons of requests in flight. Before the
backpressure patch, if you had more than the CQ ring size inflight,
you'd get dropped events. Once you get dropped events, it's game over,
the application has totally lost track. Hence I think it's safe to
unconditionally return -EBUSY if we exceed that limit.
The first one helps use cases that couldn't really use io_uring before, the
latter helps protect against use cases that would use a lot of memory.
If the request for the latter use case takes a long time to run, the
problem is even worse.
> While something could be done {efficiently,securely,etc} in the
> userspace, I would prefer to keep the kernel part simpler.
For this particular patch, the majority of issues will just read the sq
head and mask, which we will read now anyway. The extra cost is
basically a branch and cmp instruction. There's no way you can do that
in userspace that efficiently, and it helps protect the kernel.
I'm not a fan of adding stuff we don't need either, but I do believe we
need this one.
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-08 17:45 UTC
To: Pavel Begunkov, io-uring
On 11/8/19 7:05 AM, Jens Axboe wrote:
> On 11/8/19 2:56 AM, Pavel Begunkov wrote:
>> On 08/11/2019 03:19, Jens Axboe wrote:
>>> On 11/7/19 4:21 PM, Jens Axboe wrote:
>>>> I'd like some feedback on this one. Even with the overflow backpressure
>>>> patch, we still have a potentially large gap where applications can
>>>> submit IO before we get any dropped events in the CQ ring. This is
>>>> especially true if the execution time of those requests is long
>>>> (unbounded).
>>>>
>>>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>>>> have more IO pending than we can feasibly support. This is normally the
>>>> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
>>>> the CQ ring size.
>>>>
>>>> This helps manage the pending queue size instead of letting it grow
>>>> indefinitely.
>>>>
>>>> Note that we could potentially just make this the default behavior -
>>>> applications need to handle -EAGAIN returns already, in case we run out
>>>> of memory, and if we change this to return -EAGAIN as well, then it
>>>> doesn't introduce any new failure cases. I'm tempted to do that...
>>>>
>>>> Anyway, comments solicited!
>> What's wrong with giving away overflow handling to the userspace? It
>> knows its inflight count, and it shouldn't be a problem to allocate
>> large enough rings. The same goes for the backpressure patch. Do you
>> have a particular requirement/user for this?
>
> There are basically two things at play here:
>
> - The backpressure patch helps with users that can't easily size the
> ring. This could be library use cases, or just normal use cases that
> don't necessarily know how big the ring needs to be. Hence they need
> to size for the worst case, which is wasteful.
>
> - For this case, it's different. I just want to avoid having the
> application having potential tons of requests in flight. Before the
> backpressure patch, if you had more than the CQ ring size inflight,
> you'd get dropped events. Once you get dropped events, it's game over,
> the application has totally lost track. Hence I think it's safe to
> unconditionally return -EBUSY if we exceed that limit.
>
> The first one helps use cases that couldn't really use io_uring before, the
> latter helps protect against use cases that would use a lot of memory.
> If the request for the latter use case takes a long time to run, the
> problem is even worse.
>
>> While something could be done {efficiently,securely,etc} in the
>> userspace, I would prefer to keep the kernel part simpler.
>
> For this particular patch, the majority of issues will just read the sq
> head and mask, which we will read now anyway. The extra cost is
> basically a branch and cmp instruction. There's no way you can do that
> in userspace that efficiently, and it helps protect the kernel.
>
> I'm not a fan of adding stuff we don't need either, but I do believe we
> need this one.
I've been struggling a bit with how to make this reliable, and I'm not
so sure there's a way to do that. Let's say an application sets up a
ring with 8 sq entries, which would then default to 16 cq entries. With
this patch, we'd allow 16 ios inflight. But what if the application does
for (i = 0; i < 32; i++) {
sqe = get_sqe();
prep_sqe();
submit_sqe();
}
And then directly proceeds to:
do {
get_completions();
} while (has_completions);
As long as fewer than 16 requests complete before we start reaping,
we don't lose any events. Hence there's a risk of breaking existing
setups with this, even though I don't think that's a high risk.
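Fleshed out (a hedged liburing sketch with error handling omitted;
io_uring_prep_nop() just stands in for whatever the real requests are),
that pattern looks like:

#include <liburing.h>

static void overcommit_example(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	int i;

	io_uring_queue_init(8, &ring, 0);	/* SQ = 8, CQ defaults to 16 */

	for (i = 0; i < 32; i++) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

		io_uring_prep_nop(sqe);
		io_uring_submit(&ring);		/* one syscall per request */
	}

	/* only now start reaping */
	while (io_uring_peek_cqe(&ring, &cqe) == 0)
		io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
}

With requests that take a while, this happens to keep working as long as
fewer than 16 of them complete before the reap loop runs; with the
proposed limit, submission would start failing once 16 are inflight.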
We probably want to add some sysctl limit for this instead. But then
the question is, what should that entry (or entries) be?
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Pavel Begunkov @ 2019-11-09 11:16 UTC
To: Jens Axboe, io-uring
> I've been struggling a bit with how to make this reliable, and I'm not
> so sure there's a way to do that. Let's say an application sets up a
> ring with 8 sq entries, which would then default to 16 cq entries. With
> this patch, we'd allow 16 ios inflight. But what if the application does
>
> for (i = 0; i < 32; i++) {
> sqe = get_sqe();
> prep_sqe();
> submit_sqe();
> }
>
> And then directly proceeds to:
>
> do {
> get_completions();
> } while (has_completions);
>
> As long as fewer than 16 requests complete before we start reaping,
> we don't lose any events. Hence there's a risk of breaking existing
> setups with this, even though I don't think that's a high risk.
>
I think this should be considered an erroneous usage of the API.
It's better to fail ASAP than to be surprised in a production
system because of the non-deterministic nature of such code. Even worse
when trying to debug such stuff.
As for me, cases like below are too far-fetched
for (i = 0; i < n; i++)
submit_read_sqe()
for (i = 0; i < n; i++) {
device_allow_next_read()
get_single_cqe()
}
> We probably want to add some sysctl limit for this instead. But then
> the question is, what should that entry (or entries) be?
>
--
Pavel Begunkov
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-09 14:23 UTC
To: Pavel Begunkov, io-uring
On 11/9/19 4:16 AM, Pavel Begunkov wrote:
>> I've been struggling a bit with how to make this reliable, and I'm not
>> so sure there's a way to do that. Let's say an application sets up a
>> ring with 8 sq entries, which would then default to 16 cq entries. With
>> this patch, we'd allow 16 ios inflight. But what if the application does
>>
>> for (i = 0; i < 32; i++) {
>> sqe = get_sqe();
>> prep_sqe();
>> submit_sqe();
>> }
>>
>> And then directly proceeds to:
>>
>> do {
>> get_completions();
>> } while (has_completions);
>>
>> As long as fewer than 16 requests complete before we start reaping,
>> we don't lose any events. Hence there's a risk of breaking existing
>> setups with this, even though I don't think that's a high risk.
>>
>
> I think this should be considered an erroneous usage of the API.
> It's better to fail ASAP than to be surprised in a production
> system because of the non-deterministic nature of such code. Even worse
> when trying to debug such stuff.
>
> As for me, cases like below are too far-fetched
>
> for (i = 0; i < n; i++)
> submit_read_sqe()
> for (i = 0; i < n; i++) {
> device_allow_next_read()
> get_single_cqe()
> }
I can't really disagree with that, it's a use case that's bound to fail
every now and then...
But if we agree that's the case, then we should be able to just limit
based on the cq ring size in question.
Do we make it different for CQ_NODROP and !CQ_NODROP or not? Because the
above case would work with CQ_NODROP, reliably. At least CQ_NODROP is
new so we get to set the rules for that one, they just have to make
sense.
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-09 15:15 UTC
To: Pavel Begunkov, io-uring
On 11/9/19 7:23 AM, Jens Axboe wrote:
> On 11/9/19 4:16 AM, Pavel Begunkov wrote:
>>> I've been struggling a bit with how to make this reliable, and I'm not
>>> so sure there's a way to do that. Let's say an application sets up a
>>> ring with 8 sq entries, which would then default to 16 cq entries. With
>>> this patch, we'd allow 16 ios inflight. But what if the application does
>>>
>>> for (i = 0; i < 32; i++) {
>>> sqe = get_sqe();
>>> prep_sqe();
>>> submit_sqe();
>>> }
>>>
>>> And then directly proceeds to:
>>>
>>> do {
>>> get_completions();
>>> } while (has_completions);
>>>
>>> As long as fewer than 16 requests complete before we start reaping,
>>> we don't lose any events. Hence there's a risk of breaking existing
>>> setups with this, even though I don't think that's a high risk.
>>>
>>
>> I think this should be considered an erroneous usage of the API.
>> It's better to fail ASAP than to be surprised in a production
>> system because of the non-deterministic nature of such code. Even worse
>> when trying to debug such stuff.
>>
>> As for me, cases like below are too far-fetched
>>
>> for (i = 0; i < n; i++)
>> submit_read_sqe()
>> for (i = 0; i < n; i++) {
>> device_allow_next_read()
>> get_single_cqe()
>> }
>
> I can't really disagree with that, it's a use case that's bound to fail
> every now and then...
>
> But if we agree that's the case, then we should be able to just limit
> based on the cq ring size in question.
>
> Do we make it different for CQ_NODROP and !CQ_NODROP or not? Because the
> above case would work with CQ_NODROP, reliably. At least CQ_NODROP is
> new so we get to set the rules for that one, they just have to make
> sense.
Just tossing this one out there, it's an incremental to v2 of the patch.
- Check upfront if we're going over the limit, use the same kind of
cost amortization logic except something that's appropriate for
once-per-batch.
- Fold in with the backpressure -EBUSY logic
This avoids breaking up chains, for example, and also means we don't
have to run these checks for every request.
Limit is > 2 * cq_entries. I think that's liberal enough to not cause
issues, while still having a relation to the sq/cq ring sizes which
I like.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 18711d45b994..53ccd4e1dee2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -737,25 +737,6 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
return NULL;
}
-static bool io_req_over_limit(struct io_ring_ctx *ctx)
-{
- unsigned inflight;
-
- /*
- * This doesn't need to be super precise, so only check every once
- * in a while.
- */
- if (ctx->cached_sq_head & ctx->sq_mask)
- return false;
-
- /*
- * Use 2x the max CQ ring size
- */
- inflight = ctx->cached_sq_head -
- (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
- return inflight >= 2 * IORING_MAX_CQ_ENTRIES;
-}
-
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
@@ -766,8 +747,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
return ERR_PTR(-ENXIO);
if (!state) {
- if (unlikely(io_req_over_limit(ctx)))
- goto out_limit;
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
goto fallback;
@@ -775,8 +754,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
size_t sz;
int ret;
- if (unlikely(io_req_over_limit(ctx)))
- goto out_limit;
sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
@@ -812,7 +789,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req = io_get_fallback_req(ctx);
if (req)
goto got_it;
-out_limit:
percpu_ref_put(&ctx->refs);
return ERR_PTR(-EBUSY);
}
@@ -3021,6 +2997,30 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
return false;
}
+static bool io_sq_over_limit(struct io_ring_ctx *ctx, unsigned to_submit)
+{
+ unsigned inflight;
+
+ if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
+ !list_empty(&ctx->cq_overflow_list))
+ return true;
+
+ /*
+ * This doesn't need to be super precise, so only check every once
+ * in a while.
+ */
+ if ((ctx->cached_sq_head & ctx->sq_mask) !=
+ ((ctx->cached_sq_head + to_submit) & ctx->sq_mask))
+ return false;
+
+ /*
+ * Limit us to 2x the CQ ring size
+ */
+ inflight = ctx->cached_sq_head -
+ (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
+ return inflight > 2 * ctx->cq_entries;
+}
+
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct file *ring_file, int ring_fd,
struct mm_struct **mm, bool async)
@@ -3031,8 +3031,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
int i, submitted = 0;
bool mm_fault = false;
- if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
- !list_empty(&ctx->cq_overflow_list))
+ if (unlikely(io_sq_over_limit(ctx, nr)))
return -EBUSY;
if (nr > IO_PLUG_THRESHOLD) {
--
Jens Axboe
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Pavel Begunkov @ 2019-11-09 19:24 UTC
To: Jens Axboe, io-uring
On 09/11/2019 17:23, Jens Axboe wrote:
> On 11/9/19 4:16 AM, Pavel Begunkov wrote:
>>> I've been struggling a bit with how to make this reliable, and I'm not
>>> so sure there's a way to do that. Let's say an application sets up a
>>> ring with 8 sq entries, which would then default to 16 cq entries. With
>>> this patch, we'd allow 16 ios inflight. But what if the application does
>>>
>>> for (i = 0; i < 32; i++) {
>>> sqe = get_sqe();
>>> prep_sqe();
>>> submit_sqe();
>>> }
>>>
>>> And then directly proceeds to:
>>>
>>> do {
>>> get_completions();
>>> } while (has_completions);
>>>
>>> As long as fewer than 16 requests complete before we start reaping,
>>> we don't lose any events. Hence there's a risk of breaking existing
>>> setups with this, even though I don't think that's a high risk.
>>>
>>
>> I think this should be considered an erroneous usage of the API.
>> It's better to fail ASAP than to be surprised in a production
>> system because of the non-deterministic nature of such code. Even worse
>> when trying to debug such stuff.
>>
>> As for me, cases like below are too far-fetched
>>
>> for (i = 0; i < n; i++)
>> submit_read_sqe()
>> for (i = 0; i < n; i++) {
>> device_allow_next_read()
>> get_single_cqe()
>> }
>
> I can't really disagree with that, it's a use case that's bound to fail
> every now and then...
>
> But if we agree that's the case, then we should be able to just limit
> based on the cq ring size in question.
>
> Do we make it different for CQ_NODROP and !CQ_NODROP or not? Because the
> above case would work with CQ_NODROP, reliably. At least CQ_NODROP is
> new so we get to set the rules for that one, they just have to make
> sense.
>
...would work reliably... and also would let the user know about
stalling (-EBUSY). I think it's a good idea to always do
CQ_NODROP, as in the backlogged patch update.
--
Pavel Begunkov
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Pavel Begunkov @ 2019-11-09 10:33 UTC
To: Jens Axboe, io-uring
On 08/11/2019 17:05, Jens Axboe wrote:
> On 11/8/19 2:56 AM, Pavel Begunkov wrote:
>> On 08/11/2019 03:19, Jens Axboe wrote:
>>> On 11/7/19 4:21 PM, Jens Axboe wrote:
>>>> I'd like some feedback on this one. Even with the overflow backpressure
>>>> patch, we still have a potentially large gap where applications can
>>>> submit IO before we get any dropped events in the CQ ring. This is
>>>> especially true if the execution time of those requests is long
>>>> (unbounded).
>>>>
>>>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>>>> have more IO pending than we can feasibly support. This is normally the
>>>> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
>>>> the CQ ring size.
>>>>
>>>> This helps manage the pending queue size instead of letting it grow
>>>> indefinitely.
>>>>
>>>> Note that we could potentially just make this the default behavior -
>>>> applications need to handle -EAGAIN returns already, in case we run out
>>>> of memory, and if we change this to return -EAGAIN as well, then it
>>>> doesn't introduce any new failure cases. I'm tempted to do that...
>>>>
>>>> Anyway, comments solicited!
>> What's wrong with giving away overflow handling to the userspace? It
>> knows its inflight count, and it shouldn't be a problem to allocate
>> large enough rings. The same goes for the backpressure patch. Do you
>> have a particular requirement/user for this?
>
> There are basically two things at play here:
>
> - The backpressure patch helps with users that can't easily size the
> ring. This could be library use cases, or just normal use cases that
> don't necessarily know how big the ring needs to be. Hence they need
> to size for the worst case, which is wasteful.
>
> - For this case, it's different. I just want to avoid having the
> application having potential tons of requests in flight. Before the
> backpressure patch, if you had more than the CQ ring size inflight,
> you'd get dropped events. Once you get dropped events, it's game over,
> the application has totally lost track. Hence I think it's safe to
> unconditionally return -EBUSY if we exceed that limit.
>
> The first one helps use cases that couldn't really use io_uring before, the
> latter helps protect against use cases that would use a lot of memory.
> If the request for the latter use case takes a long time to run, the
> problem is even worse.
I see, thanks. I like the point about having an upper bound for inflight
memory. It seems the patch doesn't keep it strict and we can get more
than specified. Is that so?
As an RFC, we could think about using a static request pool, e.g.
with sbitmap as in blk-mq. We would also be able to get rid of some
refcounting. However, I struggle to estimate the performance difference.
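To make the idea concrete, here is a rough, single-threaded
userspace-style sketch of a bounded preallocated pool (only an
illustration of how pool exhaustion itself enforces the inflight limit;
names and sizes are made up, and a kernel version would presumably use
sbitmap plus atomics as blk-mq does):

#include <stddef.h>

#define POOL_SIZE	256
#define BITS_PER_WORD	(8 * sizeof(unsigned long))

struct request {
	int dummy;	/* per-request state would live here */
};

static struct request pool[POOL_SIZE];
static unsigned long used[POOL_SIZE / BITS_PER_WORD];

static struct request *pool_get(void)
{
	int i;

	for (i = 0; i < POOL_SIZE; i++) {
		unsigned long bit = 1UL << (i % BITS_PER_WORD);

		if (!(used[i / BITS_PER_WORD] & bit)) {
			used[i / BITS_PER_WORD] |= bit;
			return &pool[i];
		}
	}
	return NULL;	/* pool exhausted: the inflight limit is hit */
}

static void pool_put(struct request *req)
{
	size_t i = req - pool;

	used[i / BITS_PER_WORD] &= ~(1UL << (i % BITS_PER_WORD));
}

The point is just that the allocation itself bounds inflight, so no
separate accounting or -EBUSY path would be needed.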
>
>> While something could be done {efficiently,securely,etc} in the
>> userspace, I would prefer to keep the kernel part simpler.
>
> For this particular patch, the majority of issues will just read the sq
> head and mask, which we will read now anyway. The extra cost is
> basically a branch and cmp instruction. There's no way you can do that
> in userspace that efficiently, and it helps protect the kernel.
>
I'm more concerned about complexity, and bugs as a consequence. The second
concern is edge cases and the absence of formalised guarantees for them.
For example, it could fetch and submit only half of a link because of a
failed io_get_req(). Shouldn't it be kind of atomic? From the
userspace perspective I'd prefer so.
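For reference, the kind of link this worries about (a hedged
liburing-style sketch, writev chained to fsync; the prep helpers and
IOSQE_IO_LINK are standard liburing, but this is only an illustration):

#include <errno.h>
#include <sys/uio.h>
#include <liburing.h>

/*
 * A write followed by an fsync that must only run after the write.  If
 * the kernel fails request allocation after consuming the first SQE of
 * the chain, userspace would see only half of the link submitted.
 */
static int queue_write_then_fsync(struct io_uring *ring, int fd,
				  struct iovec *iov, unsigned nr_vecs)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	io_uring_prep_writev(sqe, fd, iov, nr_vecs, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* fsync depends on the write */

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;		/* chain only half prepared */
	io_uring_prep_fsync(sqe, fd, 0);

	return io_uring_submit(ring);	/* both SQEs in one submit */
}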
> I'm not a fan of adding stuff we don't need either, but I do believe we
> need this one.
>
--
Pavel Begunkov
* Re: [PATCH RFC] io_uring: limit inflight IO
From: Jens Axboe @ 2019-11-09 14:12 UTC
To: Pavel Begunkov, io-uring
On 11/9/19 3:33 AM, Pavel Begunkov wrote:
> On 08/11/2019 17:05, Jens Axboe wrote:
>> On 11/8/19 2:56 AM, Pavel Begunkov wrote:
>>> On 08/11/2019 03:19, Jens Axboe wrote:
>>>> On 11/7/19 4:21 PM, Jens Axboe wrote:
>>>>> I'd like some feedback on this one. Even with the overflow backpressure
>>>>> patch, we still have a potentially large gap where applications can
>>>>> submit IO before we get any dropped events in the CQ ring. This is
>>>>> especially true if the execution time of those requests is long
>>>>> (unbounded).
>>>>>
>>>>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>>>>> have more IO pending than we can feasibly support. This is normally the
>>>>> CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
>>>>> the CQ ring size.
>>>>>
>>>>> This helps manage the pending queue size instead of letting it grow
>>>>> indefinitely.
>>>>>
>>>>> Note that we could potentially just make this the default behavior -
>>>>> applications need to handle -EAGAIN returns already, in case we run out
>>>>> of memory, and if we change this to return -EAGAIN as well, then it
>>>>> doesn't introduce any new failure cases. I'm tempted to do that...
>>>>>
>>>>> Anyway, comments solicited!
>>> What's wrong with giving away overflow handling to the userspace? It
>>> knows its inflight count, and it shouldn't be a problem to allocate
>>> large enough rings. The same goes for the backpressure patch. Do you
>>> have a particular requirement/user for this?
>>
>> There are basically two things at play here:
>>
>> - The backpressure patch helps with users that can't easily size the
>> ring. This could be library use cases, or just normal use cases that
>> don't necessarily know how big the ring needs to be. Hence they need
>> to size for the worst case, which is wasteful.
>>
>> - For this case, it's different. I just want to avoid having the
>> application having potential tons of requests in flight. Before the
>> backpressure patch, if you had more than the CQ ring size inflight,
>> you'd get dropped events. Once you get dropped events, it's game over,
>> the application has totally lost track. Hence I think it's safe to
>> unconditionally return -EBUSY if we exceed that limit.
>>
>> The first one helps use cases that couldn't really use io_uring before, the
>> latter helps protect against use cases that would use a lot of memory.
>> If the request for the latter use case takes a long time to run, the
>> problem is even worse.
>
> I see, thanks. I like the point about having an upper bound for inflight
> memory. It seems the patch doesn't keep it strict and we can get more
> than specified. Is that so?
Right, the important part is providing some upper bound, not an exact
one.
> As an RFC, we could think about using a static request pool, e.g.
> with sbitmap as in blk-mq. We would also be able to get rid of some
> refcounting. However, I struggle to estimate the performance difference.
That would be fine if we needed exact limits; for this case it'd just be
a waste of CPU with no real win. So I would not advocate that, and it's
not like we need a tagged identifier for it either.
>>> While something could be done {efficiently,securely,etc} in the
>>> userspace, I would prefer to keep the kernel part simpler.
>>
>> For this particular patch, the majority of issues will just read the sq
>> head and mask, which we will read now anyway. The extra cost is
>> basically a branch and cmp instruction. There's no way you can do that
>> in userspace that efficiently, and it helps protect the kernel.
>>
> I'm more concerned about complexity, and bugs as a consequence. The second
> concern is edge cases and the absence of formalised guarantees for them.
>
> For example, it could fetch and submit only half of a link because of a
> failed io_get_req(). Shouldn't it be kind of atomic? From the
> userspace perspective I'd prefer so.
If we start a sequence, we should let it finish, agree. I'll post an
update.
--
Jens Axboe