On 09/11/2019 18:21, Jens Axboe wrote:
> With unbounded request times, we can potentially have a lot of IO
> inflight. As we provide no real backpressure unless
> IORING_SETUP_CQ_NODROP is set, and even there there's quite some delay
> between overflows and backpressure being applied, let's put some safety
> in place to avoid going way overboard.
>
> This limits the maximum number of inflight IO for any given io_ring_ctx
> to twice the CQ ring size. This is a losely managed limit, we only check
> for every SQ ring size number of events. That should be good enough to
> achieve our goal, which is to prevent massively deep queues. If these
> are async requests, they would just be waiting for an execution slot
> anyway.
>
> We return -EBUSY if we can't queue anymore IO. The caller should reap
> some completions and retry the operation after that. Note that this is
> a "should never hit this" kind of condition, as driving the depth into
> CQ overflow situations is unreliable.
>
> Signed-off-by: Jens Axboe
>
> ---
>
> Changes since v2:
>
> - Check upfront if we're going over the limit, use the same kind of
>   cost amortization logic except something that's appropriate for
>   once-per-batch.
>
> - Fold in with the backpressure -EBUSY logic
>
> - Use twice setup CQ ring size as the rough limit
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 81457913e9c9..9dd0f5b5e5b2 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -744,7 +744,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>  	struct io_kiocb *req;
>
>  	if (!percpu_ref_tryget(&ctx->refs))
> -		return NULL;
> +		return ERR_PTR(-ENXIO);
>
>  	if (!state) {
>  		req = kmem_cache_alloc(req_cachep, gfp);
> @@ -790,7 +790,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>  	if (req)
>  		goto got_it;
>  	percpu_ref_put(&ctx->refs);
> -	return NULL;
> +	return ERR_PTR(-EBUSY);
>  }
>
>  static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
> @@ -2992,6 +2992,30 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
>  	return false;
>  }
>
> +static bool io_sq_over_limit(struct io_ring_ctx *ctx, unsigned to_submit)
> +{
> +	unsigned inflight;
> +
> +	if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
> +	    !list_empty(&ctx->cq_overflow_list))
> +		return true;
> +
> +	/*
> +	 * This doesn't need to be super precise, so only check every once
> +	 * in a while.
> +	 */
> +	if ((ctx->cached_sq_head & ctx->sq_mask) !=
> +	    ((ctx->cached_sq_head + to_submit) & ctx->sq_mask))
> +		return false;

ctx->sq_mask = sq_entries - 1, e.g. 0x0000...ffff. Thus the code above
is modular arithmetic (% sq_entries) and equivalent to:

if (to_submit != sq_entries)
	return false;

I suggest the intention was:

if ((sq_head & ~mask) == ((sq_head + to_submit) & ~mask))
	return false;
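For illustration, a quick stand-alone sketch of the difference (user-space,
not the kernel code; sq_entries == 8, head == 5 and the helper names are
made up, with head/SQ_MASK standing in for cached_sq_head/sq_mask). "Skip"
here means hitting the "return false" path, i.e. not checking the inflight
limit for this batch:

#include <stdio.h>

#define SQ_ENTRIES	8u			/* assumed power-of-two ring size */
#define SQ_MASK		(SQ_ENTRIES - 1)

/* Check as written: skips unless to_submit is a multiple of sq_entries */
static int skip_as_written(unsigned head, unsigned to_submit)
{
	return (head & SQ_MASK) != ((head + to_submit) & SQ_MASK);
}

/* Suggested check: skips only while the batch stays inside one ring-sized window */
static int skip_as_suggested(unsigned head, unsigned to_submit)
{
	return (head & ~SQ_MASK) == ((head + to_submit) & ~SQ_MASK);
}

int main(void)
{
	unsigned head = 5;

	for (unsigned to_submit = 1; to_submit <= SQ_ENTRIES; to_submit++)
		printf("to_submit=%u as_written=%d suggested=%d\n", to_submit,
		       skip_as_written(head, to_submit),
		       skip_as_suggested(head, to_submit));
	return 0;
}

With head == 5 the as-written variant skips for every to_submit except 8,
so the limit is effectively only enforced when a full SQ ring is submitted
in one go; the suggested variant stops skipping as soon as the batch
crosses a ring-size boundary (to_submit >= 3 here), which matches the
"check every once in a while" comment.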
> +
> +	/*
> +	 * Limit us to 2x the CQ ring size
> +	 */
> +	inflight = ctx->cached_sq_head -
> +			(ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
> +	return inflight > 2 * ctx->cq_entries;
> +}
> +
>  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  			  struct file *ring_file, int ring_fd,
>  			  struct mm_struct **mm, bool async)
> @@ -3002,8 +3026,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	int i, submitted = 0;
>  	bool mm_fault = false;
>
> -	if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
> -	    !list_empty(&ctx->cq_overflow_list))
> +	if (unlikely(io_sq_over_limit(ctx, nr)))
>  		return -EBUSY;
>
>  	if (nr > IO_PLUG_THRESHOLD) {
> @@ -3016,9 +3039,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  		unsigned int sqe_flags;
>
>  		req = io_get_req(ctx, statep);
> -		if (unlikely(!req)) {
> +		if (unlikely(IS_ERR(req))) {
>  			if (!submitted)
> -				submitted = -EAGAIN;
> +				submitted = PTR_ERR(req);
>  			break;
>  		}
>  		if (!io_get_sqring(ctx, &req->submit)) {
> @@ -3039,8 +3062,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
>  			if (!shadow_req) {
>  				shadow_req = io_get_req(ctx, NULL);
> -				if (unlikely(!shadow_req))
> +				if (unlikely(IS_ERR(shadow_req))) {
> +					shadow_req = NULL;
>  					goto out;
> +				}
>  				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
>  				refcount_dec(&shadow_req->refs);
>  			}

-- 
Pavel Begunkov