public inbox for [email protected]
* [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
@ 2022-02-02 15:59 Usama Arif
  2022-02-02 16:57 ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Usama Arif @ 2022-02-02 15:59 UTC (permalink / raw)
  To: io-uring, axboe, asml.silence, linux-kernel; +Cc: fam.zheng, Usama Arif

Acquire completion_lock at the start of __io_uring_register before
registering/unregistering eventfd and release it at the end. Hence
all calls to io_cqring_ev_posted which adds to the eventfd counter
will finish before acquiring the spin_lock in io_uring_register, and
all new calls will wait till the eventfd is registered. This avoids
ring quiesce which is much more expensive than acquiring the spin_lock.

On the system tested with this patch, io_uring_register with
IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.

Signed-off-by: Usama Arif <[email protected]>
Reviewed-by: Fam Zheng <[email protected]>
---
 fs/io_uring.c | 50 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e04f718319d..e75d8abd225a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1803,11 +1803,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
 	}
 
-	if (posted)
+	if (posted) {
 		io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-	if (posted)
 		io_cqring_ev_posted(ctx);
+	}
+	spin_unlock(&ctx->completion_lock);
 	return all_flushed;
 }
 
@@ -1971,8 +1971,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 	spin_lock(&ctx->completion_lock);
 	__io_req_complete_post(req, res, cflags);
 	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
+	spin_unlock(&ctx->completion_lock);
 }
 
 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
@@ -2231,11 +2231,11 @@ static void __io_req_find_next_prep(struct io_kiocb *req)
 
 	spin_lock(&ctx->completion_lock);
 	posted = io_disarm_next(req);
-	if (posted)
+	if (posted) {
 		io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-	if (posted)
 		io_cqring_ev_posted(ctx);
+	}
+	spin_unlock(&ctx->completion_lock);
 }
 
 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -2272,8 +2272,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
 {
 	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
+	spin_unlock(&ctx->completion_lock);
 }
 
 static void handle_prev_tw_list(struct io_wq_work_node *node,
@@ -2535,8 +2535,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		}
 
 		io_commit_cqring(ctx);
-		spin_unlock(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
+		spin_unlock(&ctx->completion_lock);
 		state->flush_cqes = false;
 	}
 
@@ -5541,10 +5541,12 @@ static int io_poll_check_events(struct io_kiocb *req)
 			filled = io_fill_cqe_aux(ctx, req->user_data, mask,
 						 IORING_CQE_F_MORE);
 			io_commit_cqring(ctx);
-			spin_unlock(&ctx->completion_lock);
-			if (unlikely(!filled))
+			if (unlikely(!filled)) {
+				spin_unlock(&ctx->completion_lock);
 				return -ECANCELED;
+			}
 			io_cqring_ev_posted(ctx);
+			spin_unlock(&ctx->completion_lock);
 		} else if (req->result) {
 			return 0;
 		}
@@ -5579,8 +5581,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	hash_del(&req->hash_node);
 	__io_req_complete_post(req, req->result, 0);
 	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
+	spin_unlock(&ctx->completion_lock);
 }
 
 static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
@@ -8351,8 +8353,8 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 			spin_lock(&ctx->completion_lock);
 			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
 			io_commit_cqring(ctx);
-			spin_unlock(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
+			spin_unlock(&ctx->completion_lock);
 			io_ring_submit_unlock(ctx, lock_ring);
 		}
 
@@ -9639,11 +9641,11 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
 		}
 	}
 	spin_unlock_irq(&ctx->timeout_lock);
-	if (canceled != 0)
+	if (canceled != 0) {
 		io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-	if (canceled != 0)
 		io_cqring_ev_posted(ctx);
+	}
+	spin_unlock(&ctx->completion_lock);
 	return canceled != 0;
 }
 
@@ -10970,6 +10972,8 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_REGISTER_IOWQ_AFF:
 	case IORING_UNREGISTER_IOWQ_AFF:
 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
+	case IORING_REGISTER_EVENTFD:
+	case IORING_UNREGISTER_EVENTFD:
 		return false;
 	default:
 		return true;
@@ -11030,6 +11034,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			return -EACCES;
 	}
 
+	/*
+	 * Acquire completion_lock at the start of __io_uring_register before
+	 * registering/unregistering eventfd and release it at the end. Any
+	 * completion events pending before this call will finish before acquiring
+	 * the spin_lock here, and all new completion events will wait till the
+	 * eventfd is registered. This avoids ring quiesce which is much more
+	 * expensive than acquiring the spin_lock.
+	 */
+	if (opcode == IORING_REGISTER_EVENTFD || opcode == IORING_UNREGISTER_EVENTFD)
+		spin_lock(&ctx->completion_lock);
+
 	if (io_register_op_must_quiesce(opcode)) {
 		ret = io_ctx_quiesce(ctx);
 		if (ret)
@@ -11141,6 +11156,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		break;
 	}
 
+	if (opcode == IORING_REGISTER_EVENTFD || opcode == IORING_UNREGISTER_EVENTFD)
+		spin_unlock(&ctx->completion_lock);
+
 	if (io_register_op_must_quiesce(opcode)) {
 		/* bring the ctx back to life */
 		percpu_ref_reinit(&ctx->refs);
-- 
2.25.1



* Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-02 15:59 [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
@ 2022-02-02 16:57 ` Jens Axboe
  2022-02-02 18:32   ` Pavel Begunkov
  2022-02-02 19:18   ` Jens Axboe
  0 siblings, 2 replies; 8+ messages in thread
From: Jens Axboe @ 2022-02-02 16:57 UTC (permalink / raw)
  To: Usama Arif, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng

On 2/2/22 8:59 AM, Usama Arif wrote:
> Acquire completion_lock at the start of __io_uring_register before
> registering/unregistering eventfd and release it at the end. Hence
> all calls to io_cqring_ev_posted which adds to the eventfd counter
> will finish before acquiring the spin_lock in io_uring_register, and
> all new calls will wait till the eventfd is registered. This avoids
> ring quiesce which is much more expensive than acquiring the spin_lock.
> 
> On the system tested with this patch, io_uring_register with
> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.

This seems like optimizing for the wrong thing, so I've got a few
questions. Are you doing a lot of eventfd registrations (and unregister)
in your workload? Or is it just the initial pain of registering one? In
talking to Pavel, he suggested that RCU might be a good use case here,
and I think so too. That would still remove the need to quiesce, and the
posted side just needs a fairly cheap rcu read lock/unlock around it.

-- 
Jens Axboe



* Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-02 16:57 ` Jens Axboe
@ 2022-02-02 18:32   ` Pavel Begunkov
  2022-02-02 18:39     ` Pavel Begunkov
  2022-02-02 19:18   ` Jens Axboe
  1 sibling, 1 reply; 8+ messages in thread
From: Pavel Begunkov @ 2022-02-02 18:32 UTC (permalink / raw)
  To: Jens Axboe, Usama Arif, io-uring, linux-kernel; +Cc: fam.zheng

On 2/2/22 16:57, Jens Axboe wrote:
> On 2/2/22 8:59 AM, Usama Arif wrote:
>> Acquire completion_lock at the start of __io_uring_register before
>> registering/unregistering eventfd and release it at the end. Hence
>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>> will finish before acquiring the spin_lock in io_uring_register, and
>> all new calls will wait till the eventfd is registered. This avoids
>> ring quiesce which is much more expensive than acquiring the spin_lock.
>>
>> On the system tested with this patch, io_uring_register with
>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
> 
> This seems like optimizing for the wrong thing, so I've got a few
> questions. Are you doing a lot of eventfd registrations (and unregister)
> in your workload? Or is it just the initial pain of registering one? In
> talking to Pavel, he suggested that RCU might be a good use case here,
> and I think so too. That would still remove the need to quiesce, and the
> posted side just needs a fairly cheap rcu read lock/unlock around it.

A bit more context:

1) there is io_cqring_ev_posted_iopoll() which doesn't hold the lock
and adding it will be expensive

2) there is a not-posted optimisation for io_cqring_ev_posted() relying
on it being called after spin_unlock.

3) we don't want to unnecessarily extend the spinlock section, it's hot

4) there is wake_up_all() inside, so there will be nested locks. That's
bad for perf, but also because of potential deadlocking. E.g. poll
requests do locking in reverse order. There might be more reasons.


But there will be no complaints if you do,

if (evfd) {
	rcu_read_lock();
	eventfd_signal();
	rcu_read_unlock();
}

+ some sync on the registration front
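
For the registration front, a rough, untested sketch of the unregister
side could be the following (assuming the signal side fetches and uses
the pointer entirely inside the rcu_read_lock section):

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct eventfd_ctx *ev_fd = ctx->cq_ev_fd;

	if (!ev_fd)
		return -ENXIO;

	WRITE_ONCE(ctx->cq_ev_fd, NULL);
	/*
	 * Wait for readers that may have already fetched the old pointer
	 * under rcu_read_lock() to finish signalling before dropping it.
	 */
	synchronize_rcu();
	eventfd_ctx_put(ev_fd);
	return 0;
}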

-- 
Pavel Begunkov


* Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-02 18:32   ` Pavel Begunkov
@ 2022-02-02 18:39     ` Pavel Begunkov
  0 siblings, 0 replies; 8+ messages in thread
From: Pavel Begunkov @ 2022-02-02 18:39 UTC (permalink / raw)
  To: Jens Axboe, Usama Arif, io-uring, linux-kernel; +Cc: fam.zheng

On 2/2/22 18:32, Pavel Begunkov wrote:
> On 2/2/22 16:57, Jens Axboe wrote:
>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>> Acquire completion_lock at the start of __io_uring_register before
>>> registering/unregistering eventfd and release it at the end. Hence
>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>> will finish before acquiring the spin_lock in io_uring_register, and
>>> all new calls will wait till the eventfd is registered. This avoids
>>> ring quiesce which is much more expensive than acquiring the spin_lock.
>>>
>>> On the system tested with this patch, io_uring_register with
>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>
>> This seems like optimizing for the wrong thing, so I've got a few
>> questions. Are you doing a lot of eventfd registrations (and unregister)
>> in your workload? Or is it just the initial pain of registering one? In
>> talking to Pavel, he suggested that RCU might be a good use case here,
>> and I think so too. That would still remove the need to quiesce, and the
>> posted side just needs a fairly cheap rcu read lock/unlock around it.
> 
> A bit more context:
> 
> 1) there is io_cqring_ev_posted_iopoll() which doesn't hold the lock
> and adding it will be expensive
> 
> 2) there is a not-posted optimisation for io_cqring_ev_posted() relying
> on it being called after spin_unlock.
> 
> 3) we don't want to unnecessarily extend the spinlock section, it's hot
> 
> 4) there is wake_up_all() inside, so there will be nested locks. That's
> bad for perf, but also because of potential deadlocking. E.g. poll
> requests do locking in reverse order. There might be more reasons.

5) there won't be sync with ctx->cq_ev_fd, and so no rules when
it will start to be visible. Will need to be solved if RCU.

there are probably more caveats, those are off the top of my head


> But there will be no complaints if you do,
> 
> if (evfd) {
>      rcu_read_lock();
>      eventfd_signal();
>      rcu_read_unlock();
> }
> 
> + some sync on the registration front
> 

-- 
Pavel Begunkov


* Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-02 16:57 ` Jens Axboe
  2022-02-02 18:32   ` Pavel Begunkov
@ 2022-02-02 19:18   ` Jens Axboe
  2022-02-03 15:14     ` [External] " Usama Arif
  1 sibling, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2022-02-02 19:18 UTC (permalink / raw)
  To: Usama Arif, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng

On 2/2/22 9:57 AM, Jens Axboe wrote:
> On 2/2/22 8:59 AM, Usama Arif wrote:
>> Acquire completion_lock at the start of __io_uring_register before
>> registering/unregistering eventfd and release it at the end. Hence
>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>> will finish before acquiring the spin_lock in io_uring_register, and
>> all new calls will wait till the eventfd is registered. This avoids
>> ring quiesce which is much more expensive than acquiring the
>> spin_lock.
>>
>> On the system tested with this patch, io_uring_register with
>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
> 
> This seems like optimizing for the wrong thing, so I've got a few
> questions. Are you doing a lot of eventfd registrations (and
> unregister) in your workload? Or is it just the initial pain of
> registering one? In talking to Pavel, he suggested that RCU might be a
> good use case here, and I think so too. That would still remove the
> need to quiesce, and the posted side just needs a fairly cheap rcu
> read lock/unlock around it.

Totally untested, but perhaps can serve as a starting point or
inspiration.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 64c055421809..195752f4823f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,12 @@ struct io_submit_state {
 	struct blk_plug		plug;
 };
 
+struct io_ev_fd {
+	struct eventfd_ctx	*cq_ev_fd;
+	struct io_ring_ctx	*ctx;
+	struct rcu_head		rcu;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -412,7 +418,7 @@ struct io_ring_ctx {
 	struct {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
-		struct eventfd_ctx	*cq_ev_fd;
+		struct io_ev_fd		*io_ev_fd;
 		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
 		atomic_t		cq_timeouts;
@@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 
 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 {
-	if (likely(!ctx->cq_ev_fd))
+	if (likely(!ctx->io_ev_fd))
 		return false;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		return false;
 	return !ctx->eventfd_async || io_wq_current_is_worker();
 }
 
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
+
+	if (!io_should_trigger_evfd(ctx))
+		return;
+
+	rcu_read_lock();
+	ev_fd = READ_ONCE(ctx->io_ev_fd);
+	if (ev_fd)
+		eventfd_signal(ev_fd->cq_ev_fd, 1);
+	rcu_read_unlock();
+}
+
 /*
  * This should only get called when at least one event has been posted.
  * Some applications rely on the eventfd notification count only changing
@@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 	 */
 	if (wq_has_sleeper(&ctx->cq_wait))
 		wake_up_all(&ctx->cq_wait);
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 		if (waitqueue_active(&ctx->cq_wait))
 			wake_up_all(&ctx->cq_wait);
 	}
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_ev_fd *ev_fd;
 	__s32 __user *fds = arg;
 	int fd;
 
-	if (ctx->cq_ev_fd)
+	if (ctx->io_ev_fd)
 		return -EBUSY;
 
 	if (copy_from_user(&fd, fds, sizeof(*fds)))
 		return -EFAULT;
 
-	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
-	if (IS_ERR(ctx->cq_ev_fd)) {
-		int ret = PTR_ERR(ctx->cq_ev_fd);
+	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+	if (!ev_fd)
+		return -ENOMEM;
+
+	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(ev_fd->cq_ev_fd)) {
+		int ret = PTR_ERR(ev_fd->cq_ev_fd);
 
-		ctx->cq_ev_fd = NULL;
+		kfree(ev_fd);
 		return ret;
 	}
 
+	ev_fd->ctx = ctx;
+	WRITE_ONCE(ctx->io_ev_fd, ev_fd);
 	return 0;
 }
 
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+	struct io_ring_ctx *ctx = ev_fd->ctx;
+
+	eventfd_ctx_put(ev_fd->cq_ev_fd);
+	kfree(ev_fd);
+	WRITE_ONCE(ctx->io_ev_fd, NULL);
+}
+
 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
-	if (ctx->cq_ev_fd) {
-		eventfd_ctx_put(ctx->cq_ev_fd);
-		ctx->cq_ev_fd = NULL;
+	struct io_ev_fd *ev_fd = ctx->io_ev_fd;
+
+	if (ev_fd) {
+		call_rcu(&ev_fd->rcu, io_eventfd_put);
 		return 0;
 	}
 
@@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->rings)
 		__io_cqring_overflow_flush(ctx, true);
 	mutex_unlock(&ctx->uring_lock);
-	io_eventfd_unregister(ctx);
+	if (ctx->io_ev_fd) {
+		eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
+		kfree(ctx->io_ev_fd);
+	}
 	io_destroy_buffers(ctx);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_UNREGISTER_IOWQ_AFF:
 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
 	case IORING_REGISTER_MAP_BUFFERS:
+	case IORING_REGISTER_EVENTFD:
+	case IORING_UNREGISTER_EVENTFD:
 		return false;
 	default:
 		return true;
@@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
-							ctx->cq_ev_fd != NULL, ret);
+							ctx->io_ev_fd != NULL, ret);
 out_fput:
 	fdput(f);
 	return ret;

-- 
Jens Axboe



* Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-02 19:18   ` Jens Axboe
@ 2022-02-03 15:14     ` Usama Arif
  2022-02-03 15:44       ` Pavel Begunkov
  0 siblings, 1 reply; 8+ messages in thread
From: Usama Arif @ 2022-02-03 15:14 UTC (permalink / raw)
  To: Jens Axboe, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng



On 02/02/2022 19:18, Jens Axboe wrote:
> On 2/2/22 9:57 AM, Jens Axboe wrote:
>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>> Acquire completion_lock at the start of __io_uring_register before
>>> registering/unregistering eventfd and release it at the end. Hence
>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>> will finish before acquiring the spin_lock in io_uring_register, and
>>> all new calls will wait till the eventfd is registered. This avoids
>>> ring quiesce which is much more expensive than acquiring the
>>> spin_lock.
>>>
>>> On the system tested with this patch, io_uring_register with
>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>
>> This seems like optimizing for the wrong thing, so I've got a few
>> questions. Are you doing a lot of eventfd registrations (and
>> unregister) in your workload? Or is it just the initial pain of
>> registering one? In talking to Pavel, he suggested that RCU might be a
>> good use case here, and I think so too. That would still remove the
>> need to quiesce, and the posted side just needs a fairly cheap rcu
>> read lock/unlock around it.
> 
> Totally untested, but perhaps can serve as a starting point or
> inspiration.
>

Hi,

Thank you for the replies and comments. My usecase registers only one 
eventfd at the start.

Thanks a lot for the diff below, it was a really good starting point!
I have sent a couple of patches for review implementing the RCU way.
I think that the below diff might have some issues, so I have done some
parts in a different way. Please have a look at the diff below, where I
note the places I think might have issues like race conditions, and how
the patches I sent resolve them.

I see that if we remove ring quiesce from the above 3 opcodes, then
only IORING_REGISTER_ENABLE_RINGS and IORING_REGISTER_RESTRICTIONS are
left needing ring quiesce. I just had a quick look at those, and from
what I can see we might not need to enter ring quiesce for
IORING_REGISTER_ENABLE_RINGS, as the ring is already disabled at that
point? And for IORING_REGISTER_RESTRICTIONS, if we take a similar
approach to IORING_REGISTER_EVENTFD, i.e. wrap ctx->restrictions inside
an RCU-protected data structure, use a spin_lock to prevent multiple
io_register_restrictions calls at the same time, and use rcu_read_lock
in io_check_restriction, then we can remove ring quiesce from
io_uring_register altogether?

My usecase only uses IORING_REGISTER_EVENTFD, but I think entering ring
quiesce costs about the same for the other opcodes. If the above sounds
reasonable, please let me know and I can send patches removing ring
quiesce from io_uring_register.

Thanks again!
Usama

> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 64c055421809..195752f4823f 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -329,6 +329,12 @@ struct io_submit_state {
>   	struct blk_plug		plug;
>   };
>   
> +struct io_ev_fd {
> +	struct eventfd_ctx	*cq_ev_fd;
> +	struct io_ring_ctx	*ctx;
> +	struct rcu_head		rcu;
> +};
> +
>   struct io_ring_ctx {
>   	/* const or read-mostly hot data */
>   	struct {
> @@ -412,7 +418,7 @@ struct io_ring_ctx {
>   	struct {
>   		unsigned		cached_cq_tail;
>   		unsigned		cq_entries;
> -		struct eventfd_ctx	*cq_ev_fd;
> +		struct io_ev_fd		*io_ev_fd;
>   		struct wait_queue_head	cq_wait;
>   		unsigned		cq_extra;
>   		atomic_t		cq_timeouts;
> @@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
>   
>   static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
>   {
> -	if (likely(!ctx->cq_ev_fd))
> +	if (likely(!ctx->io_ev_fd))
>   		return false;
>   	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
>   		return false;
>   	return !ctx->eventfd_async || io_wq_current_is_worker();
>   }
>   
> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
> +{
> +	struct io_ev_fd *ev_fd;
> +
> +	if (!io_should_trigger_evfd(ctx))
> +		return;
> +

As the above io_should_trigger_evfd call is not inside the
rcu_read_lock section in this diff, another thread could, at this
point, unregister the eventfd that was just checked in
io_should_trigger_evfd (call it eventfd1) and register another one
(eventfd2). If execution then switches back to the thread executing
io_eventfd_signal, the eventfd_signal below will be sent to eventfd2,
which is not right. I think there might be other problematic scenarios
here as well.

What I have done to avoid this is treat ctx->io_ev_fd as an
RCU-protected data structure throughout the file. Hence, the entire
io_eventfd_signal is a read-side critical section, and a single
rcu_dereference()d ev_fd is used for both the io_should_trigger_evfd
checks and eventfd_signal.

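Roughly, the idea on the signal side is the following (a simplified
sketch, not the exact code from the patches; it assumes ctx->io_ev_fd
is annotated __rcu and keeps the ctx->eventfd_async check from your
diff):

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/*
	 * A single rcu_dereference()d ev_fd is used for both the "should
	 * we signal" checks and the signal itself, so a concurrent
	 * unregister + register can't swap the eventfd underneath us.
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);
	if (likely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;
	if (!ctx->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
}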

> +	rcu_read_lock();
> +	ev_fd = READ_ONCE(ctx->io_ev_fd);
> +	if (ev_fd)
> +		eventfd_signal(ev_fd->cq_ev_fd, 1);
> +	rcu_read_unlock();
> +}
> +
>   /*
>    * This should only get called when at least one event has been posted.
>    * Some applications rely on the eventfd notification count only changing
> @@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
>   	 */
>   	if (wq_has_sleeper(&ctx->cq_wait))
>   		wake_up_all(&ctx->cq_wait);
> -	if (io_should_trigger_evfd(ctx))
> -		eventfd_signal(ctx->cq_ev_fd, 1);
> +	io_eventfd_signal(ctx);
>   }
>   
>   static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
> @@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
>   		if (waitqueue_active(&ctx->cq_wait))
>   			wake_up_all(&ctx->cq_wait);
>   	}
> -	if (io_should_trigger_evfd(ctx))
> -		eventfd_signal(ctx->cq_ev_fd, 1);
> +	io_eventfd_signal(ctx);
>   }
>   
>   /* Returns true if there are no backlogged entries after the flush */
> @@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>   
>   static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
>   {
> +	struct io_ev_fd *ev_fd;
>   	__s32 __user *fds = arg;
>   	int fd;
>   
> -	if (ctx->cq_ev_fd)
> +	if (ctx->io_ev_fd)
>   		return -EBUSY;
>

You could have 2 threads call io_uring_register on the same ring at
the same time. They could both pass the above ctx->io_ev_fd check
(both seeing it as NULL) and then race to assign ctx->io_ev_fd; I
guess that's why locks are used for writes when using RCU. I have used
ctx->ev_fd_lock in the patch I pushed for review. Also, as
ctx->io_ev_fd is RCU-protected, accesses only go through
rcu_dereference_protected/rcu_dereference/rcu_assign_pointer.

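Simplified, the idea on the register side is then roughly the
following (an illustrative sketch rather than the exact code from the
patches; ev_fd_lock is shown as a spinlock here and ctx->io_ev_fd as
__rcu):

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}
	ev_fd->ctx = ctx;

	spin_lock(&ctx->ev_fd_lock);
	if (rcu_dereference_protected(ctx->io_ev_fd,
				      lockdep_is_held(&ctx->ev_fd_lock))) {
		/* lost the race against a concurrent registration */
		spin_unlock(&ctx->ev_fd_lock);
		eventfd_ctx_put(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return -EBUSY;
	}
	/* publish only once the struct is fully initialised */
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	spin_unlock(&ctx->ev_fd_lock);
	return 0;
}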

>   	if (copy_from_user(&fd, fds, sizeof(*fds)))
>   		return -EFAULT;
>   
> -	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
> -	if (IS_ERR(ctx->cq_ev_fd)) {
> -		int ret = PTR_ERR(ctx->cq_ev_fd);
> +	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
> +	if (!ev_fd)
> +		return -ENOMEM;
> +
> +	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
> +	if (IS_ERR(ev_fd->cq_ev_fd)) {
> +		int ret = PTR_ERR(ev_fd->cq_ev_fd);
>   
> -		ctx->cq_ev_fd = NULL;
> +		kfree(ev_fd);
>   		return ret;
>   	}
>   
> +	ev_fd->ctx = ctx;
> +	WRITE_ONCE(ctx->io_ev_fd, ev_fd);
>   	return 0;
>   }
>   
> +static void io_eventfd_put(struct rcu_head *rcu)
> +{
> +	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
> +	struct io_ring_ctx *ctx = ev_fd->ctx;
> +
> +	eventfd_ctx_put(ev_fd->cq_ev_fd);
> +	kfree(ev_fd);
> +	WRITE_ONCE(ctx->io_ev_fd, NULL);
> +}
> +
>   static int io_eventfd_unregister(struct io_ring_ctx *ctx)
>   {
> -	if (ctx->cq_ev_fd) {
> -		eventfd_ctx_put(ctx->cq_ev_fd);
> -		ctx->cq_ev_fd = NULL;
> +	struct io_ev_fd *ev_fd = ctx->io_ev_fd;
> +
> +	if (ev_fd) {
> +		call_rcu(&ev_fd->rcu, io_eventfd_put);
>   		return 0;
>   	}
>   
> @@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>   	if (ctx->rings)
>   		__io_cqring_overflow_flush(ctx, true);
>   	mutex_unlock(&ctx->uring_lock);
> -	io_eventfd_unregister(ctx);
> +	if (ctx->io_ev_fd) {
> +		eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
> +		kfree(ctx->io_ev_fd);
> +	}
>   	io_destroy_buffers(ctx);
>   	if (ctx->sq_creds)
>   		put_cred(ctx->sq_creds);
> @@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
>   	case IORING_UNREGISTER_IOWQ_AFF:
>   	case IORING_REGISTER_IOWQ_MAX_WORKERS:
>   	case IORING_REGISTER_MAP_BUFFERS:
> +	case IORING_REGISTER_EVENTFD:
> +	case IORING_UNREGISTER_EVENTFD:
>   		return false;
>   	default:
>   		return true;
> @@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>   	ret = __io_uring_register(ctx, opcode, arg, nr_args);
>   	mutex_unlock(&ctx->uring_lock);
>   	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
> -							ctx->cq_ev_fd != NULL, ret);
> +							ctx->io_ev_fd != NULL, ret);
>   out_fput:
>   	fdput(f);
>   	return ret;
> 


* Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:14     ` [External] " Usama Arif
@ 2022-02-03 15:44       ` Pavel Begunkov
  2022-02-03 15:55         ` Pavel Begunkov
  0 siblings, 1 reply; 8+ messages in thread
From: Pavel Begunkov @ 2022-02-03 15:44 UTC (permalink / raw)
  To: Usama Arif, Jens Axboe, io-uring, linux-kernel; +Cc: fam.zheng

On 2/3/22 15:14, Usama Arif wrote:
> On 02/02/2022 19:18, Jens Axboe wrote:
>> On 2/2/22 9:57 AM, Jens Axboe wrote:
>>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>>> Acquire completion_lock at the start of __io_uring_register before
>>>> registering/unregistering eventfd and release it at the end. Hence
>>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>>> will finish before acquiring the spin_lock in io_uring_register, and
>>>> all new calls will wait till the eventfd is registered. This avoids
>>>> ring quiesce which is much more expensive than acquiring the
>>>> spin_lock.
>>>>
>>>> On the system tested with this patch, io_uring_register with
>>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>>
>>> This seems like optimizing for the wrong thing, so I've got a few
>>> questions. Are you doing a lot of eventfd registrations (and
>>> unregister) in your workload? Or is it just the initial pain of
>>> registering one? In talking to Pavel, he suggested that RCU might be a
>>> good use case here, and I think so too. That would still remove the
>>> need to quiesce, and the posted side just needs a fairly cheap rcu
>>> read lock/unlock around it.
>>
>> Totally untested, but perhaps can serve as a starting point or
>> inspiration.
>>
> 
> Hi,
> 
> Thank you for the replies and comments. My usecase registers only one eventfd at the start.

Then it's overkill. Update io_register_op_must_quiesce(), set ->cq_ev_fd
on registration with WRITE_ONCE(), read it in io_cqring_ev_posted* with
READ_ONCE() and you're set.

There is a caveat: ->cq_ev_fd won't be immediately visible to already
inflight requests, but we can say it's the responsibility of userspace
to wait for a grace period, i.e. for requests submitted before
registration, io_cqring_ev_posted* might or might not see the updated
->cq_ev_fd, which works perfectly if there were no requests in the
first place. Of course it changes the behaviour, so it will need a new
register opcode.

-- 
Pavel Begunkov


* Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:44       ` Pavel Begunkov
@ 2022-02-03 15:55         ` Pavel Begunkov
  0 siblings, 0 replies; 8+ messages in thread
From: Pavel Begunkov @ 2022-02-03 15:55 UTC (permalink / raw)
  To: Usama Arif, Jens Axboe, io-uring, linux-kernel; +Cc: fam.zheng

On 2/3/22 15:44, Pavel Begunkov wrote:
> On 2/3/22 15:14, Usama Arif wrote:
>> On 02/02/2022 19:18, Jens Axboe wrote:
>>> On 2/2/22 9:57 AM, Jens Axboe wrote:
>>>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>>>> Acquire completion_lock at the start of __io_uring_register before
>>>>> registering/unregistering eventfd and release it at the end. Hence
>>>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>>>> will finish before acquiring the spin_lock in io_uring_register, and
>>>>> all new calls will wait till the eventfd is registered. This avoids
>>>>> ring quiesce which is much more expensive than acquiring the
>>>>> spin_lock.
>>>>>
>>>>> On the system tested with this patch, io_uring_register with
>>>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>>>
>>>> This seems like optimizing for the wrong thing, so I've got a few
>>>> questions. Are you doing a lot of eventfd registrations (and
>>>> unregister) in your workload? Or is it just the initial pain of
>>>> registering one? In talking to Pavel, he suggested that RCU might be a
>>>> good use case here, and I think so too. That would still remove the
>>>> need to quiesce, and the posted side just needs a fairly cheap rcu
>>>> read lock/unlock around it.
>>>
>>> Totally untested, but perhaps can serve as a starting point or
>>> inspiration.
>>>
>>
>> Hi,
>>
>> Thank you for the replies and comments. My usecase registers only one eventfd at the start.
> 
> Then it's overkill. Update io_register_op_must_quiesce(), set ->cq_ev_fd
> on registration with WRITE_ONCE(), read it in io_cqring_ev_posted* with
> READ_ONCE() and you're set.

Actually needs smp_store_release/smp_load_acquire

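i.e. something along these lines (an untested sketch with the
copy_from_user of the fd elided; unregister/re-register would still
need extra care, as per the caveat below):

static int io_eventfd_register(struct io_ring_ctx *ctx, int fd)
{
	struct eventfd_ctx *ev_fd;

	/* callers are already serialised by ->uring_lock */
	if (ctx->cq_ev_fd)
		return -EBUSY;

	ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd))
		return PTR_ERR(ev_fd);

	/* pairs with the smp_load_acquire() on the completion side */
	smp_store_release(&ctx->cq_ev_fd, ev_fd);
	return 0;
}

/* called from io_cqring_ev_posted*() */
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	/* pairs with the smp_store_release() at registration time */
	struct eventfd_ctx *ev_fd = smp_load_acquire(&ctx->cq_ev_fd);

	if (likely(!ev_fd))
		return;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return;
	if (!ctx->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd, 1);
}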

> There is a caveat: ->cq_ev_fd won't be immediately visible to already
> inflight requests, but we can say it's the responsibility of userspace
> to wait for a grace period, i.e. for requests submitted before
> registration, io_cqring_ev_posted* might or might not see the updated
> ->cq_ev_fd, which works perfectly if there were no requests in the
> first place. Of course it changes the behaviour, so it will need a new
> register opcode.
> 

-- 
Pavel Begunkov


