public inbox for [email protected]
From: Olivier Langlois <[email protected]>
To: Hao Xu <[email protected]>, Jens Axboe <[email protected]>,
	[email protected]
Subject: Re: napi_busy_poll
Date: Tue, 15 Feb 2022 03:37:19 -0500
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

On Tue, 2022-02-15 at 01:13 +0800, Hao Xu wrote:
> Yes, it seems that epoll_wait only does busy polling for 1 NAPI.
> I think it is because the busy polling there is just an optimization
> (doing some polling before trapping into sleep), not a must-have, so
> it's kind of a trade-off between polling and reacting to other events,
> I guess. Not very sure about this too..
> The iouring implementation I'm thinking of is polling for every NAPI
> involved.

Hao,

I have found the explanation for the epoll oddity in
https://legacy.netdevconf.info/2.1/slides/apr6/dumazet-BUSY-POLLING-Netdev-2.1.pdf:

Linux-4.12 changes
epoll() support was added by Sridhar Samudrala and Alexander Duyck,
with the assumption that an application using epoll() and busy polling
would first make sure that it would classify sockets based on their
receive queue (NAPI ID), and use at least one epoll fd per receive
queue.

SO_INCOMING_NAPI_ID was added as a new socket option to retrieve this
information, instead of relying on other mechanisms (CPU or NUMA
identifications).
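
For reference, this is how an application can do that classification
from userspace; a minimal untested sketch (the helper name is mine):

#include <stdio.h>
#include <sys/socket.h>

/* Return the NAPI ID of the receive queue that last delivered data to
 * this socket, or 0 if the socket has not received anything yet.
 */
static unsigned int get_napi_id(int fd)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID,
		       &napi_id, &len) < 0) {
		perror("getsockopt(SO_INCOMING_NAPI_ID)");
		return 0;
	}
	return napi_id;
}

An epoll application would then bucket its fds by get_napi_id() and
register each bucket on its own epoll fd.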

I have created a small toy implementation with some limitations:
1. It assumes a single napi_id per io_uring ctx, like epoll does (see
   the sketch after this list for what lifting that might look like).
2. It does not detect when all the pending requests using busy-poll
   capable sockets are gone.
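
For limitation 1, the natural extension would be to keep a small list
of the NAPI IDs seen on a ctx instead of a single field, and to poll
each of them, as Hao suggested. A hypothetical, untested kernel-side
sketch (names are mine and locking is omitted; this is not part of the
patch below):

#include <linux/list.h>
#include <linux/slab.h>
#include <net/busy_poll.h>

struct io_napi_entry {
	struct list_head	list;
	unsigned int		napi_id;
};

/* remember a NAPI ID the first time a request touches its queue */
static void io_add_napi_id(struct list_head *napi_list, unsigned int napi_id)
{
	struct io_napi_entry *ne;

	if (napi_id < MIN_NAPI_ID)
		return;
	list_for_each_entry(ne, napi_list, list)
		if (ne->napi_id == napi_id)
			return;		/* already tracked */
	ne = kmalloc(sizeof(*ne), GFP_KERNEL);
	if (ne) {
		ne->napi_id = napi_id;
		list_add(&ne->list, napi_list);
	}
}

/* one polling pass over every NAPI instance seen so far */
static void io_busy_loop_all(struct list_head *napi_list)
{
	struct io_napi_entry *ne;

	list_for_each_entry(ne, napi_list, list)
		napi_busy_loop(ne->napi_id, NULL, NULL, true,
			       BUSY_POLL_BUDGET);
}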

That being said, I have not been able to make it work yet. For some
unknown reason, no valid napi_id is extracted from the sockets added
to the context, so napi_busy_loop() is never called.

I find that very strange since, prior to using io_uring, my code was
using epoll and busy polling was working fine with my application
sockets. Something is escaping my comprehension. I must be tired and
it will become obvious eventually...
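
One thing worth ruling out: sk->sk_napi_id is only filled in by the
receive path, so it stays 0 until the socket has actually received
packets through NAPI. Since the patch below samples it at submission
time, submitting the first request before any traffic has arrived
would explain never seeing a valid ID. A quick untested userspace
check, with placeholder peer address and no error handling:

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(9999),		/* placeholder */
	};
	unsigned int napi_id;
	socklen_t len = sizeof(napi_id);
	char buf[2048];
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);	/* placeholder */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));

	getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
	printf("before any rx: %u\n", napi_id);		/* expect 0 */

	recv(fd, buf, sizeof(buf), 0);			/* first packet */
	getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
	printf("after first rx: %u\n", napi_id);	/* expect >= MIN_NAPI_ID */

	close(fd);
	return 0;
}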

In the meantime, here is what I have created so far. Feel free to play
with it and/or enhance it:

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..d3deca9b9ef5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
 #include <net/sock.h>
 #include <net/af_unix.h>
 #include <net/scm.h>
+#include <net/busy_poll.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -395,6 +396,10 @@ struct io_ring_ctx {
 	struct list_head	sqd_list;
 
 	unsigned long		check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* used to track busy poll napi_id */
+	unsigned int napi_id;
+#endif
 
 	struct {
 		unsigned		cached_cq_tail;
@@ -6976,7 +6981,40 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
 	io_req_set_rsrc_node(req, ctx);
 	return file;
 }
+#ifdef CONFIG_NET_RX_BUSY_POLL
+/*
+ * Set busy poll NAPI ID from sk, the same way epoll does.
+ */
+static inline void io_set_busy_poll_napi_id(struct io_ring_ctx *ctx,
+					    struct file *file)
+{
+	unsigned int napi_id;
+	struct socket *sock;
+	struct sock *sk;
+
+	if (!net_busy_loop_on())
+		return;
 
+	sock = sock_from_file(file);
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+
+	/* Non-NAPI IDs can be rejected
+	 *	or
+	 * Nothing to do if we already have this ID
+	 */
+	if (napi_id < MIN_NAPI_ID || napi_id == ctx->napi_id)
+		return;
+
+	/* record NAPI ID for use in next busy poll */
+	ctx->napi_id = napi_id;
+}
+#endif
 static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
 				       struct io_kiocb *req, int fd)
 {
@@ -6985,8 +7023,14 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
 	trace_io_uring_file_get(ctx, fd);
 
 	/* we don't allow fixed io_uring files */
-	if (file && unlikely(file->f_op == &io_uring_fops))
-		io_req_track_inflight(req);
+	if (file) {
+		if (unlikely(file->f_op == &io_uring_fops))
+			io_req_track_inflight(req);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		else
+			io_set_busy_poll_napi_id(ctx, file);
+#endif
+	}
 	return file;
 }
 
@@ -7489,7 +7533,22 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
 		   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
 	spin_unlock(&ctx->completion_lock);
 }
+#ifdef CONFIG_NET_RX_BUSY_POLL
+/*
+ * Busy poll if globally on and supporting sockets found
+ */
+static inline bool io_napi_busy_loop(struct io_ring_ctx *ctx)
+{
+	unsigned int napi_id = ctx->napi_id;
 
+	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
+		napi_busy_loop(napi_id, NULL, NULL, true,
+			       BUSY_POLL_BUDGET);
+		return true;
+	}
+	return false;
+}
+#endif
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
 	unsigned int to_submit;
@@ -7518,7 +7577,10 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		    !(ctx->flags & IORING_SETUP_R_DISABLED))
 			ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		if (io_napi_busy_loop(ctx))
+			++ret;
+#endif
 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
 			wake_up(&ctx->sqo_sq_wait);
 		if (creds)
@@ -7649,6 +7711,9 @@ struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned busy_poll_to;
+#endif
 };
 
 static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -7709,6 +7774,29 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	return !*timeout ? -ETIME : 1;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline bool io_busy_loop_timeout(unsigned long start_time,
+					unsigned long bp_usec)
+{
+	if (bp_usec) {
+		unsigned long end_time = start_time + bp_usec;
+		unsigned long now = busy_loop_current_time();
+
+		return time_after(now, end_time);
+	}
+	return true;
+}
+
+static bool io_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct io_wait_queue *iowq = p;
+
+	return io_busy_loop_timeout(start_time, iowq->busy_poll_to) ||
+	       io_run_task_work_sig() ||
+	       io_should_wake(iowq);
+}
+#endif
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
  */
@@ -7729,12 +7817,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		if (!io_run_task_work())
 			break;
 	} while (1);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	iowq.busy_poll_to = 0;
+#endif
 	if (uts) {
 		struct timespec64 ts;
 
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
+		    (ctx->napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
+			unsigned busy_poll_to =
+				READ_ONCE(sysctl_net_busy_poll);
+			struct timespec64 pollto =
+				ns_to_timespec64(1000 * busy_poll_to);
+
+			if (timespec64_compare(&ts, &pollto) > 0) {
+				ts = timespec64_sub(ts, pollto);
+				iowq.busy_poll_to = busy_poll_to;
+			} else {
+				iowq.busy_poll_to =
+					timespec64_to_ns(&ts) / 1000;
+				ts.tv_sec = 0;
+				ts.tv_nsec = 0;
+			}
+		}
+#endif
 		timeout = timespec64_to_jiffies(&ts);
 	}
 
@@ -7759,6 +7868,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
 
 	trace_io_uring_cqring_wait(ctx, min_events);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (iowq.busy_poll_to)
+		napi_busy_loop(ctx->napi_id, io_busy_loop_end, &iowq,
+			       true, BUSY_POLL_BUDGET);
+#endif
 	do {
 		/* if we can't even flush overflow, don't wait for more */
 		if (!io_cqring_overflow_flush(ctx)) {
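
For anyone who wants to exercise the patch, a minimal untested
userspace sketch using liburing (busy polling must also be enabled
globally, e.g. sysctl -w net.core.busy_poll=50, and 'fd' must be a
connected socket that has already received traffic so its NAPI ID is
populated):

#include <errno.h>
#include <stdio.h>
#include <liburing.h>

/* 'ring' comes from io_uring_queue_init(); with the patch, part of
 * the 1s timeout below should be spent in napi_busy_loop() instead
 * of sleeping.
 */
static int wait_with_busy_poll(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1 };
	static char buf[2048];
	int ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_recv(sqe, fd, buf, sizeof(buf), 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
	if (!ret)
		io_uring_cqe_seen(ring, cqe);
	return ret;
}

Note that whether io_uring_wait_cqe_timeout() reaches io_cqring_wait()
with a uts argument depends on the liburing version, so this is only a
sketch of the intent.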


Thread overview: 23+ messages
2022-02-08 14:58 napi_busy_poll Olivier Langlois
2022-02-08 17:05 ` napi_busy_poll Jens Axboe
2022-02-09  3:34   ` napi_busy_poll Hao Xu
2022-02-12 19:51     ` napi_busy_poll Olivier Langlois
2022-02-13 18:47       ` napi_busy_poll Jens Axboe
2022-02-14 17:13       ` napi_busy_poll Hao Xu
2022-02-15  8:37         ` Olivier Langlois [this message]
2022-02-15 18:05           ` napi_busy_poll Olivier Langlois
2022-02-16  3:12             ` napi_busy_poll Hao Xu
2022-02-16 19:19               ` napi_busy_poll Olivier Langlois
2022-02-16 12:14             ` napi_busy_poll Hao Xu
2022-02-17 20:28               ` napi_busy_poll Olivier Langlois
2022-02-18  8:06                 ` napi_busy_poll Hao Xu
2022-02-19  7:14                   ` napi_busy_poll Olivier Langlois
2022-02-21  4:52                     ` napi_busy_poll Hao Xu
2022-02-17 23:18               ` napi_busy_poll Olivier Langlois
2022-02-17 23:25                 ` napi_busy_poll Jens Axboe
2022-02-18  7:21                 ` napi_busy_poll Hao Xu
2022-02-18  5:05               ` napi_busy_poll Olivier Langlois
2022-02-18  7:41                 ` napi_busy_poll Hao Xu
2022-02-19  7:02                   ` napi_busy_poll Olivier Langlois
2022-02-21  5:03                     ` napi_busy_poll Hao Xu
2022-02-25  4:42                       ` napi_busy_poll Olivier Langlois
