public inbox for [email protected]
* [PATCH 0/2] abstract napi tracking strategy
@ 2024-08-13 16:44 Olivier Langlois
  2024-08-13 17:10 ` [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops Olivier Langlois
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Olivier Langlois @ 2024-08-13 16:44 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

The current napi tracking strategy induces a non-negligible overhead.
Every time a multishot poll is triggered or any poll is armed, if napi is
enabled on the ring, a lookup is performed to either add a new napi id into
the napi_list or update its timeout value.

For many scenarios, this is overkill as the napi id list will be pretty
much static most of the time. To address this common scenario, a new
abstraction has been created following the common Linux kernel idiom of
creating an abstract interface with a struct filled with function pointers.

Creating an alternate napi tracking strategy is therefore made in 2 phases.

1. Introduce the io_napi_tracking_ops interface
2. Implement a static napi tracking by defining a new io_napi_tracking_ops
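
As an illustration, a rough userspace sketch of driving the static
tracking with the uapi from patch 2 could look like this (liburing's
io_uring_register_napi() helper is assumed, and ring/napi_id are
placeholders; a napi id can be obtained with SO_INCOMING_NAPI_ID):

	struct io_uring_napi napi = {
		.busy_poll_to     = 50,	/* busy poll timeout, usec */
		.prefer_busy_poll = 1,
		.opcode           = IO_URING_NAPI_REGISTER_OP,
		.op_param         = IO_URING_NAPI_TRACKING_STATIC,
	};

	/* opt into static tracking */
	io_uring_register_napi(&ring, &napi);

	/* manually pin one napi id into the napi_list */
	napi.opcode   = IO_URING_NAPI_STATIC_ADD_ID;
	napi.op_param = napi_id;
	io_uring_register_napi(&ring, &napi);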

Olivier Langlois (2):
  io_uring/napi: Introduce io_napi_tracking_ops
  io_uring/napi: add static napi tracking strategy

 include/linux/io_uring_types.h |  12 +-
 include/uapi/linux/io_uring.h  |  30 ++++-
 io_uring/fdinfo.c              |   4 +
 io_uring/napi.c                | 226 +++++++++++++++++++++++++++++----
 io_uring/napi.h                |  11 +-
 5 files changed, 243 insertions(+), 40 deletions(-)

-- 
2.46.0



* [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops
  2024-08-13 16:44 [PATCH 0/2] abstract napi tracking strategy Olivier Langlois
@ 2024-08-13 17:10 ` Olivier Langlois
  2024-08-14 11:44   ` Olivier Langlois
  2024-08-13 17:11 ` [PATCH 2/2] io_uring/napi: add static napi tracking strategy Olivier Langlois
  2024-08-13 18:33 ` [PATCH 0/2] abstract " Jens Axboe
  2 siblings, 1 reply; 16+ messages in thread
From: Olivier Langlois @ 2024-08-13 17:10 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

The long-term goal is to lay out a framework able to offer different
napi tracking strategies to the user. The obvious first alternative
strategy is static tracking, where the user updates the napi_list
manually, removing the overhead of io_uring managing the list
dynamically.

Signed-off-by: Olivier Langlois <[email protected]>
---
 include/linux/io_uring_types.h | 12 +++++-
 io_uring/fdinfo.c              |  4 ++
 io_uring/napi.c                | 76 ++++++++++++++++++++++++++++++----
 io_uring/napi.h                | 11 +----
 4 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3315005df117..c1d1b28f8cca 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -217,6 +217,16 @@ struct io_alloc_cache {
 	size_t			elem_size;
 };
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+struct io_napi_tracking_ops {
+	void (*add_id)(struct io_kiocb *req);
+	bool (*do_busy_loop)(struct io_ring_ctx *ctx,
+			     void *loop_end_arg);
+	void (*show_fdinfo)(struct io_ring_ctx *ctx,
+			    struct seq_file *m);
+};
+#endif
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -402,11 +412,11 @@ struct io_ring_ctx {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	struct list_head	napi_list;	/* track busy poll napi_id */
 	spinlock_t		napi_lock;	/* napi_list lock */
+	struct io_napi_tracking_ops *napi_ops;
 
 	/* napi busy poll default timeout */
 	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
-	bool			napi_enabled;
 
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b1e0e0d85349..fa773687a684 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -223,5 +223,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	}
 
 	spin_unlock(&ctx->completion_lock);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	ctx->napi_ops->show_fdinfo(ctx, m);
+#endif
+
 }
 #endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 1de1d4d62925..75ac850af0c0 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,7 +38,7 @@ static inline ktime_t net_to_ktime(unsigned long t)
 	return ns_to_ktime(t << 10);
 }
 
-void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+static inline void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
 {
 	struct hlist_head *hash_list;
 	unsigned int napi_id;
@@ -136,8 +136,52 @@ static bool io_napi_busy_loop_should_end(void *data,
 	return false;
 }
 
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
-				   void *loop_end_arg)
+/*
+ * Does not perform any busy polling but still checks whether list entries
+ * are stale if the list is not empty. This can happen when napi is
+ * unregistered after having been enabled for some time.
+ */
+static bool no_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+				     void *loop_end_arg)
+{
+	struct io_napi_entry *e;
+	bool is_stale = false;
+
+	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
+		if (time_after(jiffies, e->timeout))
+			is_stale = true;
+	}
+
+	return is_stale;
+}
+
+static void no_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+	seq_puts(m, "NAPI:\tdisabled\n");
+}
+
+/*
+ * default ops for a newly created ring for which NAPI busy poll is not enabled
+ */
+static struct io_napi_tracking_ops no_tracking_ops = {
+	.add_id = NULL,
+	.do_busy_loop = no_tracking_do_busy_loop,
+	.show_fdinfo = no_tracking_show_fdinfo,
+};
+
+static void dynamic_tracking_add_id(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct socket *sock;
+
+	sock = sock_from_file(req->file);
+	if (sock)
+		__io_napi_add(ctx, sock);
+}
+
+static bool dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+					  void *loop_end_arg)
 {
 	struct io_napi_entry *e;
 	bool (*loop_end)(void *, unsigned long) = NULL;
@@ -157,6 +201,23 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 	return is_stale;
 }
 
+static void dynamic_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					 struct seq_file *m)
+{
+	seq_puts(m, "NAPI:\tenabled\n");
+	seq_printf(m, "napi_busy_poll_to:\t%llu\n", ctx->napi_busy_poll_dt);
+	if (ctx->napi_prefer_busy_poll)
+		seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+	else
+		seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+}
+
+static struct io_napi_tracking_ops dynamic_tracking_ops = {
+	.add_id = dynamic_tracking_add_id,
+	.do_busy_loop = dynamic_tracking_do_busy_loop,
+	.show_fdinfo = dynamic_tracking_show_fdinfo,
+};
+
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 				       struct io_wait_queue *iowq)
 {
@@ -172,7 +233,7 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 
 	rcu_read_lock();
 	do {
-		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
+		is_stale = ctx->napi_ops->do_busy_loop(ctx, loop_end_arg);
 	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
 	rcu_read_unlock();
 
@@ -193,6 +254,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
 	spin_lock_init(&ctx->napi_lock);
 	ctx->napi_prefer_busy_poll = false;
 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+	ctx->napi_ops = &no_tracking_ops;
 }
 
 /*
@@ -241,7 +303,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 
 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
-	WRITE_ONCE(ctx->napi_enabled, true);
+	WRITE_ONCE(ctx->napi_ops, &dynamic_tracking_ops);
 	return 0;
 }
 
@@ -265,7 +327,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_enabled, false);
+	WRITE_ONCE(ctx->napi_ops, &no_tracking_ops);
 	return 0;
 }
 
@@ -321,7 +383,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
 		return 0;
 
 	rcu_read_lock();
-	is_stale = __io_napi_do_busy_loop(ctx, NULL);
+	is_stale = ctx->napi_ops->do_busy_loop(ctx, NULL);
 	rcu_read_unlock();
 
 	io_napi_remove_stale(ctx, is_stale);
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 27b88c3eb428..3d68d8e7b108 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -15,8 +15,6 @@ void io_napi_free(struct io_ring_ctx *ctx);
 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
 
-void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
-
 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
 		struct io_wait_queue *iowq, ktime_t to_wait);
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
@@ -53,14 +51,9 @@ static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
 static inline void io_napi_add(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct socket *sock;
-
-	if (!READ_ONCE(ctx->napi_enabled))
-		return;
 
-	sock = sock_from_file(req->file);
-	if (sock)
-		__io_napi_add(ctx, sock);
+	if (ctx->napi_ops->add_id)
+		ctx->napi_ops->add_id(req);
 }
 
 #else
-- 
2.46.0



* [PATCH 2/2] io_uring/napi: add static napi tracking strategy
  2024-08-13 16:44 [PATCH 0/2] abstract napi tracking strategy Olivier Langlois
  2024-08-13 17:10 ` [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops Olivier Langlois
@ 2024-08-13 17:11 ` Olivier Langlois
  2024-08-13 18:33 ` [PATCH 0/2] abstract " Jens Axboe
  2 siblings, 0 replies; 16+ messages in thread
From: Olivier Langlois @ 2024-08-13 17:11 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

Add a static napi tracking strategy that allows the user to manually
manage the list of napi ids to busy poll, relieving the ring of
dynamically updating the list.

Signed-off-by: Olivier Langlois <[email protected]>
---
 include/uapi/linux/io_uring.h |  30 ++++++-
 io_uring/napi.c               | 162 ++++++++++++++++++++++++++++------
 2 files changed, 163 insertions(+), 29 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2aaf7ee256ac..f72471b19af2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -728,12 +728,38 @@ struct io_uring_buf_status {
 	__u32	resv[8];
 };
 
+enum io_uring_napi_op {
+	/* register/unregister backward-compatible opcode */
+	IO_URING_NAPI_REGISTER_OP = 0,
+
+	/* opcodes to update napi_list when static tracking is used */
+	IO_URING_NAPI_STATIC_ADD_ID = 1,
+	IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+	IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+	IO_URING_NAPI_TRACKING_STATIC = 1
+};
+
 /* argument for IORING_(UN)REGISTER_NAPI */
 struct io_uring_napi {
 	__u32	busy_poll_to;
 	__u8	prefer_busy_poll;
-	__u8	pad[3];
-	__u64	resv;
+
+	/* an io_uring_napi_op value */
+	__u8	opcode;
+	__u8	pad[2];
+
+	/*
+	 * for IO_URING_NAPI_REGISTER_OP, it is an
+	 * io_uring_napi_tracking_strategy value.
+	 *
+	 * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+	 * it is the napi id to add to or delete from the napi_list.
+	 */
+	__u32	op_param;
+	__u32	resv;
 };
 
 /*
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 75ac850af0c0..b66ff15fcc72 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,37 +38,29 @@ static inline ktime_t net_to_ktime(unsigned long t)
 	return ns_to_ktime(t << 10);
 }
 
-static inline void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+static int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 {
 	struct hlist_head *hash_list;
-	unsigned int napi_id;
-	struct sock *sk;
 	struct io_napi_entry *e;
 
-	sk = sock->sk;
-	if (!sk)
-		return;
-
-	napi_id = READ_ONCE(sk->sk_napi_id);
-
 	/* Non-NAPI IDs can be rejected. */
 	if (napi_id < MIN_NAPI_ID)
-		return;
+		return -EINVAL;
 
 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
 
 	rcu_read_lock();
 	e = io_napi_hash_find(hash_list, napi_id);
 	if (e) {
-		e->timeout = jiffies + NAPI_TIMEOUT;
+		WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
 		rcu_read_unlock();
-		return;
+		return -EEXIST;
 	}
 	rcu_read_unlock();
 
 	e = kmalloc(sizeof(*e), GFP_NOWAIT);
 	if (!e)
-		return;
+		return -ENOMEM;
 
 	e->napi_id = napi_id;
 	e->timeout = jiffies + NAPI_TIMEOUT;
@@ -77,23 +69,62 @@ static inline void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
 		spin_unlock(&ctx->napi_lock);
 		kfree(e);
-		return;
+		return -EEXIST;
 	}
 
 	hlist_add_tail_rcu(&e->node, hash_list);
-	list_add_tail(&e->list, &ctx->napi_list);
+	list_add_tail_rcu(&e->list, &ctx->napi_list);
 	spin_unlock(&ctx->napi_lock);
+	return 0;
+}
+
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+	struct hlist_head *hash_list;
+	struct io_napi_entry *e;
+
+	/* Non-NAPI IDs can be rejected. */
+	if (napi_id < MIN_NAPI_ID)
+		return -EINVAL;
+
+	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+	spin_lock(&ctx->napi_lock);
+	e = io_napi_hash_find(hash_list, napi_id);
+	if (unlikely(!e)) {
+		spin_unlock(&ctx->napi_lock);
+		return -ENOENT;
+	}
+
+	list_del_rcu(&e->list);
+	hash_del_rcu(&e->node);
+	kfree_rcu(e, rcu);
+	spin_unlock(&ctx->napi_lock);
+	return 0;
+}
+
+static inline void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+{
+	unsigned int napi_id;
+	struct sock *sk;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	__io_napi_add_id(ctx, napi_id);
 }
 
 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
 {
+	struct hlist_node *tmp;
 	struct io_napi_entry *e;
 	unsigned int i;
 
 	spin_lock(&ctx->napi_lock);
-	hash_for_each(ctx->napi_ht, i, e, node) {
+	hash_for_each_safe(ctx->napi_ht, i, tmp, e, node) {
 		if (time_after(jiffies, e->timeout)) {
-			list_del(&e->list);
+			list_del_rcu(&e->list);
 			hash_del_rcu(&e->node);
 			kfree_rcu(e, rcu);
 		}
@@ -201,23 +232,68 @@ static bool dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
 	return is_stale;
 }
 
-static void dynamic_tracking_show_fdinfo(struct io_ring_ctx *ctx,
-					 struct seq_file *m)
+static void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					struct seq_file *m,
+					const char *tracking_strategy)
 {
 	seq_puts(m, "NAPI:\tenabled\n");
-	seq_printf(m, "napi_busy_poll_to:\t%llu\n", ctx->napi_busy_poll_dt);
+	seq_printf(m, "napi tracking:\t%s\n", tracking_strategy);
+	seq_printf(m, "napi_busy_poll_to:\t%llu\n", ctx->napi_busy_poll_dt);
 	if (ctx->napi_prefer_busy_poll)
 		seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
 	else
 		seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
 }
 
+static void dynamic_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					 struct seq_file *m)
+{
+	common_tracking_show_fdinfo(ctx, m, "dynamic");
+}
+
 static struct io_napi_tracking_ops dynamic_tracking_ops = {
 	.add_id = dynamic_tracking_add_id,
 	.do_busy_loop = dynamic_tracking_do_busy_loop,
-	.show_fdinfo = dynamic_tracking_show_fdinfo,
+	.show_fdinfo  = dynamic_tracking_show_fdinfo,
+};
+
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+					 void *loop_end_arg)
+{
+	struct io_napi_entry *e;
+	bool (*loop_end)(void *, unsigned long) = NULL;
+
+	if (loop_end_arg)
+		loop_end = io_napi_busy_loop_should_end;
+
+	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
+		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+	}
+
+	return false;
+}
+
+static void static_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					struct seq_file *m)
+{
+	common_tracking_show_fdinfo(ctx, m, "static");
+}
+
+static struct io_napi_tracking_ops static_tracking_ops = {
+	.add_id = NULL,
+	.do_busy_loop = static_tracking_do_busy_loop,
+	.show_fdinfo = static_tracking_show_fdinfo,
 };
 
+static inline u32 io_napi_get_tracking(struct io_ring_ctx *ctx)
+{
+	return ctx->napi_ops == &static_tracking_ops;
+}
+
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 				       struct io_wait_queue *iowq)
 {
@@ -273,9 +349,30 @@ void io_napi_free(struct io_ring_ctx *ctx)
 		hash_del_rcu(&e->node);
 		kfree_rcu(e, rcu);
 	}
+	INIT_LIST_HEAD_RCU(&ctx->napi_list);
 	spin_unlock(&ctx->napi_lock);
 }
 
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+				 struct io_uring_napi *napi)
+{
+	switch (napi->op_param) {
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		WRITE_ONCE(ctx->napi_ops, &dynamic_tracking_ops);
+		break;
+	case IO_URING_NAPI_TRACKING_STATIC:
+		WRITE_ONCE(ctx->napi_ops, &static_tracking_ops);
+		/* clean the napi list for manual setup */
+		io_napi_free(ctx);
+		break;
+	default:
+		return -EINVAL;
+	}
+	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	return 0;
+}
+
 /*
  * io_napi_register() - Register napi with io-uring
  * @ctx: pointer to io-uring context structure
@@ -287,7 +384,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
-		.prefer_busy_poll = ctx->napi_prefer_busy_poll
+		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
+		.op_param	  = io_napi_get_tracking(ctx)
 	};
 	struct io_uring_napi napi;
 
@@ -295,16 +393,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (copy_from_user(&napi, arg, sizeof(napi)))
 		return -EFAULT;
-	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+	if (napi.pad[0] || napi.pad[1] || napi.resv)
 		return -EINVAL;
 
 	if (copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
-	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
-	WRITE_ONCE(ctx->napi_ops, &dynamic_tracking_ops);
-	return 0;
+	switch (napi.opcode) {
+	case IO_URING_NAPI_REGISTER_OP:
+		return io_napi_register_napi(ctx, &napi);
+	case IO_URING_NAPI_STATIC_ADD_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_add_id(ctx, napi.op_param);
+	case IO_URING_NAPI_STATIC_DEL_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_del_id(ctx, napi.op_param);
+	default:
+		return -EINVAL;
+	}
 }
 
 /*
-- 
2.46.0



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 16:44 [PATCH 0/2] abstract napi tracking strategy Olivier Langlois
  2024-08-13 17:10 ` [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops Olivier Langlois
  2024-08-13 17:11 ` [PATCH 2/2] io_uring/napi: add static napi tracking strategy Olivier Langlois
@ 2024-08-13 18:33 ` Jens Axboe
  2024-08-13 21:25   ` Olivier Langlois
  2024-08-13 21:34   ` Olivier Langlois
  2 siblings, 2 replies; 16+ messages in thread
From: Jens Axboe @ 2024-08-13 18:33 UTC (permalink / raw)
  To: Olivier Langlois, Pavel Begunkov, io-uring

On 8/13/24 10:44 AM, Olivier Langlois wrote:
> The current napi tracking strategy induces a non-negligible overhead.
> Every time a multishot poll is triggered or any poll is armed, if napi is
> enabled on the ring, a lookup is performed to either add a new napi id into
> the napi_list or update its timeout value.
> 
> For many scenarios, this is overkill as the napi id list will be pretty
> much static most of the time. To address this common scenario, a new
> abstraction has been created following the common Linux kernel idiom of
> creating an abstract interface with a struct filled with function pointers.
> 
> Creating an alternate napi tracking strategy is therefore made in 2 phases.
> 
> 1. Introduce the io_napi_tracking_ops interface
> 2. Implement a static napi tracking by defining a new io_napi_tracking_ops

I don't think we should create ops for this, unless there's a strict
need to do so. Indirect function calls aren't cheap, and the CPU side
mitigations for security issues made them worse.

You're not wrong that ops is not an uncommon idiom in the kernel, but
it's a lot less prevalent as a solution than it used to be, exactly
because of the above reasons.

-- 
Jens Axboe



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 18:33 ` [PATCH 0/2] abstract " Jens Axboe
@ 2024-08-13 21:25   ` Olivier Langlois
  2024-08-13 21:44     ` Jens Axboe
  2024-08-13 22:36     ` Pavel Begunkov
  2024-08-13 21:34   ` Olivier Langlois
  1 sibling, 2 replies; 16+ messages in thread
From: Olivier Langlois @ 2024-08-13 21:25 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
> On 8/13/24 10:44 AM, Olivier Langlois wrote:
> > The current napi tracking strategy induces a non-negligible
> > overhead. Every time a multishot poll is triggered or any poll is
> > armed, if napi is enabled on the ring, a lookup is performed to
> > either add a new napi id into the napi_list or update its timeout
> > value.
> > 
> > For many scenarios, this is overkill as the napi id list will be
> > pretty
> > much static most of the time. To address this common scenario, a
> > new
> > abstraction has been created following the common Linux kernel
> > idiom of
> > creating an abstract interface with a struct filled with function
> > pointers.
> > 
> > Creating an alternate napi tracking strategy is therefore made in 2
> > phases.
> > 
> > 1. Introduce the io_napi_tracking_ops interface
> > 2. Implement a static napi tracking by defining a new
> > io_napi_tracking_ops
> 
> I don't think we should create ops for this, unless there's a strict
> need to do so. Indirect function calls aren't cheap, and the CPU side
> mitigations for security issues made them worse.
> 
> You're not wrong that ops is not an uncommon idiom in the kernel, but
> it's a lot less prevalent as a solution than it used to be, exactly
> because of the above reasons.
> 
ok. Do you have a reference explaining this?
and what type of construct would you use instead?

AFAIK, a big performance killer is the branch mispredictions coming
from big switch/case or if/else-if/else blocks, and it was precisely the
reason why you replaced the big switch/case io_uring used to have with
the function pointers in io_issue_def...

I consume an enormous amount of programming learning material daily,
and this is the first time that I am hearing this.

If there were a performance concern about this type of construct,
considering that my main programming language is C++, I am a bit
surprised that I have not seen anything about problems with C++
vtbls...

but oh well, I am learning new stuff every day, so please share the
references you have about the topic so that I can perfect my knowledge.

thank you,



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 18:33 ` [PATCH 0/2] abstract " Jens Axboe
  2024-08-13 21:25   ` Olivier Langlois
@ 2024-08-13 21:34   ` Olivier Langlois
  2024-08-13 21:45     ` Jens Axboe
  1 sibling, 1 reply; 16+ messages in thread
From: Olivier Langlois @ 2024-08-13 21:34 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
> On 8/13/24 10:44 AM, Olivier Langlois wrote:
> > The current napi tracking strategy induces a non-negligible
> > overhead. Every time a multishot poll is triggered or any poll is
> > armed, if napi is enabled on the ring, a lookup is performed to
> > either add a new napi id into the napi_list or update its timeout
> > value.
> > 
> > For many scenarios, this is overkill as the napi id list will be
> > pretty
> > much static most of the time. To address this common scenario, a
> > new
> > abstraction has been created following the common Linux kernel
> > idiom of
> > creating an abstract interface with a struct filled with function
> > pointers.
> > 
> > Creating an alternate napi tracking strategy is therefore made in 2
> > phases.
> > 
> > 1. Introduce the io_napi_tracking_ops interface
> > 2. Implement a static napi tracking by defining a new
> > io_napi_tracking_ops
> 
> I don't think we should create ops for this, unless there's a strict
> need to do so. Indirect function calls aren't cheap, and the CPU side
> mitigations for security issues made them worse.
> 
> You're not wrong that ops is not an uncommon idiom in the kernel, but
> it's a lot less prevalent as a solution than it used to be, exactly
> because of the above reasons.
> 
if indirection is a very big deal, it might be possible to remove one
level of indirection.

I did entertain the idea of making copies of the io_napi_tracking_ops
structs instead of storing their addresses. I did not keep this option
because of the way I implemented io_napi_get_tracking()...

but if this would be an acceptable compromise, it is definitely
possible.



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 21:25   ` Olivier Langlois
@ 2024-08-13 21:44     ` Jens Axboe
  2024-08-15 22:17       ` Olivier Langlois
  2024-08-13 22:36     ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2024-08-13 21:44 UTC (permalink / raw)
  To: Olivier Langlois, Pavel Begunkov, io-uring

On 8/13/24 3:25 PM, Olivier Langlois wrote:
> On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
>> On 8/13/24 10:44 AM, Olivier Langlois wrote:
>>> The current napi tracking strategy induces a non-negligible
>>> overhead. Every time a multishot poll is triggered or any poll is
>>> armed, if napi is enabled on the ring, a lookup is performed to
>>> either add a new napi id into the napi_list or update its timeout
>>> value.
>>>
>>> For many scenarios, this is overkill as the napi id list will be
>>> pretty
>>> much static most of the time. To address this common scenario, a
>>> new
>>> abstraction has been created following the common Linux kernel
>>> idiom of
>>> creating an abstract interface with a struct filled with function
>>> pointers.
>>>
>>> Creating an alternate napi tracking strategy is therefore made in 2
>>> phases.
>>>
>>> 1. Introduce the io_napi_tracking_ops interface
>>> 2. Implement a static napi tracking by defining a new
>>> io_napi_tracking_ops
>>
>> I don't think we should create ops for this, unless there's a strict
>> need to do so. Indirect function calls aren't cheap, and the CPU side
>> mitigations for security issues made them worse.
>>
>> You're not wrong that ops is not an uncommon idiom in the kernel, but
>> it's a lot less prevalent as a solution than it used to be, exactly
>> because of the above reasons.
>>
> ok. Do you have a reference explaining this?
> and what type of construct would you use instead?

See all the spectre nonsense, and the mitigations that followed from
that.

> AFAIK, a big performance killer is the branch mispredictions coming
> from big switch/case or if/else-if/else blocks, and it was precisely the
> reason why you replaced the big switch/case io_uring used to have with
> the function pointers in io_issue_def...

For sure, which is why io_uring itself ended up using indirect function
calls, because the table just became unwieldy. But that's a different
case from adding it for just a single case, or two. For those, branch
prediction should be fine, as it would always have the same outcome.

> I consume an enormous amount of programming learning material daily,
> and this is the first time that I am hearing this.

The kernel and backend programming are a bit different in that regard,
for better or for worse.

> If there were a performance concern about this type of construct,
> considering that my main programming language is C++, I am a bit
> surprised that I have not seen anything about problems with C++
> vtbls...

It's definitely slower than a direct function call, regardless of
whether this is in the kernel or not. Can be mitigated by having the
common case be predicted with a branch. See INDIRECT_CALL_*() in the
kernel.
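
E.g., a rough sketch of what that could look like with the do_busy_loop
callback from patch 1 (untested, just to show the shape of it):

	/* predict the common dynamic tracking case, fall back to a
	 * plain indirect call otherwise; INDIRECT_CALL_1() is from
	 * include/linux/indirect_call_wrapper.h */
	is_stale = INDIRECT_CALL_1(ctx->napi_ops->do_busy_loop,
				   dynamic_tracking_do_busy_loop,
				   ctx, loop_end_arg);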

> but oh well, I am learning new stuff every day, so please share the
> references you have about the topic so that I can perfect my knowledge.

I think lwn had a recent thing on indirect function calls as it pertains
to the security modules; I'd check that first. But the spectre thing
above is likely all you need!

-- 
Jens Axboe



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 21:34   ` Olivier Langlois
@ 2024-08-13 21:45     ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2024-08-13 21:45 UTC (permalink / raw)
  To: Olivier Langlois, Pavel Begunkov, io-uring

On 8/13/24 3:34 PM, Olivier Langlois wrote:
> On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
>> On 8/13/24 10:44 AM, Olivier Langlois wrote:
>>> The current napi tracking strategy induces a non-negligible
>>> overhead. Every time a multishot poll is triggered or any poll is
>>> armed, if napi is enabled on the ring, a lookup is performed to
>>> either add a new napi id into the napi_list or update its timeout
>>> value.
>>>
>>> For many scenarios, this is overkill as the napi id list will be
>>> pretty
>>> much static most of the time. To address this common scenario, a
>>> new
>>> abstraction has been created following the common Linux kernel
>>> idiom of
>>> creating an abstract interface with a struct filled with function
>>> pointers.
>>>
>>> Creating an alternate napi tracking strategy is therefore made in 2
>>> phases.
>>>
>>> 1. Introduce the io_napi_tracking_ops interface
>>> 2. Implement a static napi tracking by defining a new
>>> io_napi_tracking_ops
>>
>> I don't think we should create ops for this, unless there's a strict
>> need to do so. Indirect function calls aren't cheap, and the CPU side
>> mitigations for security issues made them worse.
>>
>> You're not wrong that ops is not an uncommon idiom in the kernel, but
>> it's a lot less prevalent as a solution than it used to be, exactly
>> because of the above reasons.
>>
> if indirection is a very big deal, it might be possible to remove one
> level of indirection.

It's not that it's a huge deal, it's just more that if we're dealing
with a single abstraction, then I think it's somewhat overdesigning for
the use case. And I'd prefer to avoid that.

> I did entertain the idea of making copies of the io_napi_tracking_ops
> structs instead of storing their addresses. I did not keep this option
> because of the way I implemented io_napi_get_tracking()...
> 
> but if this would be an acceptable compromise, this is definitely
> something possible.

Doesn't really change it, I think. See above.

-- 
Jens Axboe



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 21:25   ` Olivier Langlois
  2024-08-13 21:44     ` Jens Axboe
@ 2024-08-13 22:36     ` Pavel Begunkov
  2024-08-14 13:28       ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2024-08-13 22:36 UTC (permalink / raw)
  To: Olivier Langlois, Jens Axboe, io-uring

On 8/13/24 22:25, Olivier Langlois wrote:
> On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
>> On 8/13/24 10:44 AM, Olivier Langlois wrote:
>>> The current napi tracking strategy induces a non-negligible
>>> overhead. Every time a multishot poll is triggered or any poll is
>>> armed, if napi is enabled on the ring, a lookup is performed to
>>> either add a new napi id into the napi_list or update its timeout
>>> value.
>>>
>>> For many scenarios, this is overkill as the napi id list will be
>>> pretty
>>> much static most of the time. To address this common scenario, a
>>> new
>>> abstraction has been created following the common Linux kernel
>>> idiom of
>>> creating an abstract interface with a struct filled with function
>>> pointers.
>>>
>>> Creating an alternate napi tracking strategy is therefore made in 2
>>> phases.
>>>
>>> 1. Introduce the io_napi_tracking_ops interface
>>> 2. Implement a static napi tracking by defining a new
>>> io_napi_tracking_ops
>>
>> I don't think we should create ops for this, unless there's a strict
>> need to do so. Indirect function calls aren't cheap, and the CPU side
>> mitigations for security issues made them worse.
>>
>> You're not wrong that ops is not an uncommon idiom in the kernel, but
>> it's a lot less prevalent as a solution than it used to be, exactly
>> because of the above reasons.
>>
> ok. Do you have a reference explaining this?
> and what type of construct would you use instead?
> 
> AFAIK, a big performance killer is the branch mispredictions coming
> from big switch/case or if/else-if/else blocks, and it was precisely the
> reason why you replaced the big switch/case io_uring used to have with
> the function pointers in io_issue_def...

Compilers can optimise switch-case very well; look up what jump
tables are, they often work even better than indirect functions even
without mitigations. And it wasn't converted because of performance;
it was a nice efficient jump table before.

And it's not like compilers can devirtualise indirect calls either; I'd
say it hits the pipeline even harder. Maybe not as hard as a long
if-else-if in the final binary, but jump tables help, and we're
talking about a single "if".

I totally agree, it's way over-engineered.
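
For the case at hand, the single-branch version is just something like
this (sketch only; the napi_track_mode field is made up here, it is
not in the posted patches):

	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
		is_stale = static_tracking_do_busy_loop(ctx, loop_end_arg);
	else
		is_stale = dynamic_tracking_do_busy_loop(ctx, loop_end_arg);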

> I consume an enormous amount of programming learning material daily,
> and this is the first time that I am hearing this.
> 
> If there were a performance concern about this type of construct,
> considering that my main programming language is C++, I am a bit
> surprised that I have not seen anything about problems with C++
> vtbls...

Even without the mitigation business, we can look up a lot about
devirtualisation, which is also why the "final" keyword exists in C++.

-- 
Pavel Begunkov


* Re: [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops
  2024-08-13 17:10 ` [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops Olivier Langlois
@ 2024-08-14 11:44   ` Olivier Langlois
  2024-08-14 13:17     ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Olivier Langlois @ 2024-08-14 11:44 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On Tue, 2024-08-13 at 13:10 -0400, Olivier Langlois wrote:
> 
> ---
>  include/linux/io_uring_types.h | 12 +++++-
>  io_uring/fdinfo.c              |  4 ++
>  io_uring/napi.c                | 76 ++++++++++++++++++++++++++++++----
>  io_uring/napi.h                | 11 +----
>  4 files changed, 86 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/io_uring_types.h
> b/include/linux/io_uring_types.h
> index 3315005df117..c1d1b28f8cca 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -217,6 +217,16 @@ struct io_alloc_cache {
>  	size_t			elem_size;
>  };
>  
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +struct io_napi_tracking_ops {
> +	void (*add_id)(struct io_kiocb *req);
> +	bool (*do_busy_loop)(struct io_ring_ctx *ctx,
> +			     void *loop_end_arg);
> +	void (*show_fdinfo)(struct io_ring_ctx *ctx,
> +			    struct seq_file *m);
> +};
> +#endif
> +
I have kept thinking about the critique...

add_id is either NULL or equal to dynamic_tracking_add_id, and the
pointer is even tested before calling it. This pointer is easily
removed.

show_fdinfo, well, this is so unimportant that if you don't like it, it
is very easily removable too. Nobody will notice.

the only thing that would remain is do_busy_loop. Its value can either
be:

- no_tracking_do_busy_loop
- dynamic_tracking_do_busy_loop
- static_tracking_do_busy_loop

so the whole io_napi_tracking_ops could be replaced by a single
function pointer.

At this point, the only remaining point to determine is which is more
efficient: calling a function pointer or running code with 2 conditional
branches. I am of the opinion that the function pointer is better, due
to my C++ background, but this is debatable...
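
Concretely, the ctx member would shrink to a single callback, in the
spirit of (a sketch, not part of the posted patches):

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* single callback replacing the whole io_napi_tracking_ops */
	bool (*napi_do_busy_loop)(struct io_ring_ctx *ctx,
				  void *loop_end_arg);
#endif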



* Re: [PATCH 1/2] io_uring/napi: Introduce io_napi_tracking_ops
  2024-08-14 11:44   ` Olivier Langlois
@ 2024-08-14 13:17     ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2024-08-14 13:17 UTC (permalink / raw)
  To: Olivier Langlois, Pavel Begunkov, io-uring

On 8/14/24 5:44 AM, Olivier Langlois wrote:
> At this point, the only remaining point to determine is which is more
> efficient: calling a function pointer or running code with 2 conditional
> branches. I am of the opinion that the function pointer is better, due
> to my C++ background, but this is debatable...

As mentioned earlier, your C++ background for systems programming isn't
super relevant here. Even without mitigations, a perfectly predictable
branch is surely faster than an indirect function call. And with
mitigations, it's not even close.

It's a single handler, just do the branches. I don't think this is worth
fretting over.

-- 
Jens Axboe



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 22:36     ` Pavel Begunkov
@ 2024-08-14 13:28       ` Pavel Begunkov
  0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2024-08-14 13:28 UTC (permalink / raw)
  To: Olivier Langlois, Jens Axboe, io-uring

On 8/13/24 23:36, Pavel Begunkov wrote:
> On 8/13/24 22:25, Olivier Langlois wrote:
>> On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
>>> On 8/13/24 10:44 AM, Olivier Langlois wrote:
>>>> The current napi tracking strategy induces a non-negligible
>>>> overhead. Every time a multishot poll is triggered or any poll is
>>>> armed, if napi is enabled on the ring, a lookup is performed to
>>>> either add a new napi id into the napi_list or update its timeout
>>>> value.
>>>>
>>>> For many scenarios, this is overkill as the napi id list will be
>>>> pretty
>>>> much static most of the time. To address this common scenario, a
>>>> new
>>>> abstraction has been created following the common Linux kernel
>>>> idiom of
>>>> creating an abstract interface with a struct filled with function
>>>> pointers.
>>>>
>>>> Creating an alternate napi tracking strategy is therefore made in 2
>>>> phases.
>>>>
>>>> 1. Introduce the io_napi_tracking_ops interface
>>>> 2. Implement a static napi tracking by defining a new
>>>> io_napi_tracking_ops
>>>
>>> I don't think we should create ops for this, unless there's a strict
>>> need to do so. Indirect function calls aren't cheap, and the CPU side
>>> mitigations for security issues made them worse.
>>>
>>> You're not wrong that ops is not an uncommon idiom in the kernel, but
>>> it's a lot less prevalent as a solution than it used to be, exactly
>>> because of the above reasons.
>>>
>> ok. Do you have a reference explaining this?
>> and what type of construct would you use instead?
>>
>> AFAIK, a big performance killer is the branch mispredictions coming
>> from big switch/case or if/else-if/else blocks, and it was precisely the
>> reason why you replaced the big switch/case io_uring used to have with
>> the function pointers in io_issue_def...
> 
> Compilers can optimise switch-case very well; look up what jump
> tables are, they often work even better than indirect functions even
> without mitigations. And it wasn't converted because of performance;
> it was a nice efficient jump table before.

Correction: switches were optimisable until -fno-jump-tables was
added because of speculation mitigations, which makes them equal to
indirect calls; and otherwise the jump tables were usually retpolined,
again making them equal to indirect calls.

-- 
Pavel Begunkov


* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-13 21:44     ` Jens Axboe
@ 2024-08-15 22:17       ` Olivier Langlois
  2024-08-15 22:44         ` Olivier Langlois
  0 siblings, 1 reply; 16+ messages in thread
From: Olivier Langlois @ 2024-08-15 22:17 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On Tue, 2024-08-13 at 15:44 -0600, Jens Axboe wrote:
> On 8/13/24 3:25 PM, Olivier Langlois wrote:
> > On Tue, 2024-08-13 at 12:33 -0600, Jens Axboe wrote:
> > > On 8/13/24 10:44 AM, Olivier Langlois wrote:
> > > > The current napi tracking strategy induces a non-negligible
> > > > overhead. Every time a multishot poll is triggered or any poll
> > > > is armed, if napi is enabled on the ring, a lookup is performed
> > > > to either add a new napi id into the napi_list or update its
> > > > timeout value.
> > > > 
> > > > For many scenarios, this is overkill as the napi id list will
> > > > be
> > > > pretty
> > > > much static most of the time. To address this common scenario,
> > > > a
> > > > new
> > > > abstraction has been created following the common Linux kernel
> > > > idiom of
> > > > creating an abstract interface with a struct filled with
> > > > function
> > > > pointers.
> > > > 
> > > > Creating an alternate napi tracking strategy is therefore made
> > > > in 2
> > > > phases.
> > > > 
> > > > 1. Introduce the io_napi_tracking_ops interface
> > > > 2. Implement a static napi tracking by defining a new
> > > > io_napi_tracking_ops
> > > 
> > > I don't think we should create ops for this, unless there's a
> > > strict
> > > need to do so. Indirect function calls aren't cheap, and the CPU
> > > side
> > > mitigations for security issues made them worse.
> > > 
> > > You're not wrong that ops is not an uncommon idiom in the kernel,
> > > but it's a lot less prevalent as a solution than it used to be,
> > > exactly because of the above reasons.
> > > 
> > ok. Do you have a reference explaining this?
> > and what type of construct would you use instead?
> 
> See all the spectre nonsense, and the mitigations that followed from
> that.
> 
> > AFAIK, a big performance killer is the branch mispredictions coming
> > from big switch/case or if/else-if/else blocks, and it was precisely
> > the reason why you replaced the big switch/case io_uring used to
> > have with the function pointers in io_issue_def...
> 
> For sure, which is why io_uring itself ended up using indirect
> function
> calls, because the table just became unwieldy. But that's a different
> case from adding it for just a single case, or two. For those, branch
> prediction should be fine, as it would always have the same outcome.
> 
> > I consume an enormous amount of programming learning material
> > daily, and this is the first time that I am hearing this.
> 
> The kernel and backend programming are a bit different in that
> regard,
> for better or for worse.
> 
> > If there were a performance concern about this type of construct,
> > considering that my main programming language is C++, I am a bit
> > surprised that I have not seen anything about problems with C++
> > vtbls...
> 
> It's definitely slower than a direct function call, regardless of
> whether this is in the kernel or not. Can be mitigated by having the
> common case be predicted with a branch. See INDIRECT_CALL_*() in the
> kernel.
> 
> > but oh well, I am learning new stuff every day, so please share the
> > references you have about the topic so that I can perfect my
> > knowledge.
> 
> I think lwn had a recent thing on indirect function calls as it
> pertains
> to the security modules; I'd check that first. But the spectre thing
> above is likely all you need!
> 

Jens,

thx a lot for the clarifications. I will for sure investigate these
leads to better understand your fear of function callbacks...

I have little interest in Spectre and other mitigations and security
in general, so I know very little about those topics.

A guy whose knowledge I value a lot is Chandler Carruth from Google,
who gave a talk about Spectre in 2018:
https://www.youtube.com/watch?v=_f7O3IfIR2k

I will rewatch his talk and check LWN about the indirect function call
INDIRECT_CALL() macros that you mention...

AFAIK, the various kernel mitigations are mostly applied when
transitioning from kernel mode back to userspace.

because otherwise the compiled code for a userspace program would be
pretty much identical to kernel compiled code.

To my eyes, what is really important is that the absolute best
technical solution is chosen, and the only way that this discussion can
be settled is with numbers. So I have created a small benchmark
program to compare a function pointer indirect call vs selecting a
function in a 3-branch if/else-if/else block. Here are the results:

----------------------------------------------------------
Benchmark                Time             CPU   Iterations
----------------------------------------------------------
BM_test_virtual      0.628 ns        0.627 ns    930255515
BM_test_ifElse        1.59 ns         1.58 ns    446805050

Add this result to my concession in
https://lore.kernel.org/io-uring/[email protected]/

that you are right for 2 of the 3 function pointers of
io_napi_tracking_ops...

Hopefully this discussion will lead us toward the best solution.

keep in mind that this point is a minuscule issue.

if you prefer the 2.5x slower construct for any reason that you do not
have to justify, I'll accept your decision and rework my proposal to go
your way.

I believe that offering some form of NAPI static tracking is still a
very interesting feature no matter what is the outcome for this very
minor technical issue.

code:
/*
 * virtual overhead vs if/else google benchmark
 *
 * Olivier Langlois - August 15, 2024
 *
 * compile cmd:
 * g++ -std=c++26 -I.. -pthread -Wall -g -O3 -pipe
 * -fno-omit-frame-pointer bench_virtual.cpp -lbenchmark -o bench_virtual
 */

#include "benchmark/benchmark.h"

/*
 * CppCon 2015: Chandler Carruth "Tuning C++: Benchmarks, and CPUs, and
 * Compilers! Oh My!
 * https://www.youtube.com/watch?v=nXaxk27zwlk
 */
static void escape(void *p)
{
    asm volatile("" : : "g"(p) : "memory");
}

bool no_tracking_do_busy_loop()
{
    int res{0};

    escape(&res);
    return res;
}

bool dynamic_tracking_do_busy_loop()
{
    int res{1};

    escape(&res);
    return res;
}

bool static_tracking_do_busy_loop()
{
    int res{2};

    escape(&res);
    return res;
}

class io_napi_tracking_ops
{
public:
    virtual bool do_busy_loop() noexcept = 0;
};

class static_tracking_ops : public io_napi_tracking_ops
{
public:
    bool do_busy_loop() noexcept override;
};

bool static_tracking_ops::do_busy_loop() noexcept
{
    return static_tracking_do_busy_loop();
}

bool testVirtual(io_napi_tracking_ops *ptr)
{
    return ptr->do_busy_loop();
}

bool testIfElseDispatch(int i)
{
    if (i == 0)
        return no_tracking_do_busy_loop();
    else if (i == 1)
        return dynamic_tracking_do_busy_loop();
    else
        return static_tracking_do_busy_loop();
}

void BM_test_virtual(benchmark::State &state)
{
    static_tracking_ops vObj;
    volatile io_napi_tracking_ops *ptr = &vObj;

    for (auto _ : state) {
        benchmark::DoNotOptimize(testVirtual(
            const_cast<io_napi_tracking_ops *>(ptr)));
    }
}

void BM_test_ifElse(benchmark::State &state)
{
    volatile int i = 2;

    for (auto _ : state) {
        benchmark::DoNotOptimize(testIfElseDispatch(i));
    }
}

BENCHMARK(BM_test_virtual);
BENCHMARK(BM_test_ifElse);
BENCHMARK_MAIN();



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-15 22:17       ` Olivier Langlois
@ 2024-08-15 22:44         ` Olivier Langlois
  2024-08-16 14:26           ` Pavel Begunkov
  0 siblings, 1 reply; 16+ messages in thread
From: Olivier Langlois @ 2024-08-15 22:44 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

On Thu, 2024-08-15 at 18:17 -0400, Olivier Langlois wrote:
> 
> To my eyes, what is really important is that the absolute best
> technical solution is chosen, and the only way that this discussion
> can be settled is with numbers. So I have created a small benchmark
> program to compare a function pointer indirect call vs selecting a
> function in a 3-branch if/else-if/else block. Here are the results:
> 
> ----------------------------------------------------------
> Benchmark                Time             CPU   Iterations
> ----------------------------------------------------------
> BM_test_virtual      0.628 ns        0.627 ns    930255515
> BM_test_ifElse        1.59 ns         1.58 ns    446805050
> 
I have fixed my benchmark:

----------------------------------------------------------
Benchmark                Time             CPU   Iterations
----------------------------------------------------------
BM_test_virtual       2.57 ns         2.53 ns    277764970
BM_test_ifElse        1.58 ns         1.57 ns    445197861

code:
using Func_t = bool (*)();

bool testVirtual(Func_t ptr)
{
    return ptr();
}

void BM_test_virtual(benchmark::State &state)
{
    volatile Func_t ptr = &static_tracking_do_busy_loop;

    for (auto _ : state) {
        benchmark::DoNotOptimize(testVirtual(ptr));
    }
}



* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-15 22:44         ` Olivier Langlois
@ 2024-08-16 14:26           ` Pavel Begunkov
  2024-09-16 18:29             ` Olivier Langlois
  0 siblings, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2024-08-16 14:26 UTC (permalink / raw)
  To: Olivier Langlois, Jens Axboe, io-uring

On 8/15/24 23:44, Olivier Langlois wrote:
> On Thu, 2024-08-15 at 18:17 -0400, Olivier Langlois wrote:
>>
>> To my eyes, what is really important is that the absolute best
>> technical solution is chosen, and the only way that this discussion
>> can be settled is with numbers. So I have created a small benchmark
>> program to compare a function pointer indirect call vs selecting a
>> function in a 3-branch if/else-if/else block. Here are the results:

FWIW, it's just one branch / two different options in this case.
We shouldn't be doing a call if napi has never been requested,
so a napi_dont_do_anything callback is not an option.

>> ----------------------------------------------------------
>> Benchmark                Time             CPU   Iterations
>> ----------------------------------------------------------
>> BM_test_virtual      0.628 ns        0.627 ns    930255515
>> BM_test_ifElse        1.59 ns         1.58 ns    446805050
>>
> I have fixed my benchmark:
> 
> ----------------------------------------------------------
> Benchmark                Time             CPU   Iterations
> ----------------------------------------------------------
> BM_test_virtual       2.57 ns         2.53 ns    277764970
> BM_test_ifElse        1.58 ns         1.57 ns    445197861

You passed the compiler flags for mitigations, right?

-mindirect-branch=thunk  -mfunction-return=thunk -mindirect-branch-register
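
I.e., something like rebuilding the benchmark with those flags added to
the compile cmd from your mail (approximating what a mitigated kernel
build does):

	g++ -std=c++26 -O3 -mindirect-branch=thunk -mfunction-return=thunk \
	    -mindirect-branch-register bench_virtual.cpp -lbenchmark \
	    -o bench_virtual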

Regardless of overhead, my concern is the complexity and
amount of extra code. It's just over engineered. It's like
adding a virtual templated hierarchy of classes just to
implement a program that prints "2".

I pushed what I had (the last 2 patches); you can use it as a
reference, but be aware that it's a completely untested
draft with some obvious problems and an ugly uapi.

https://github.com/isilence/linux.git manual-napi
https://github.com/isilence/linux/commits/manual-napi/

-- 
Pavel Begunkov


* Re: [PATCH 0/2] abstract napi tracking strategy
  2024-08-16 14:26           ` Pavel Begunkov
@ 2024-09-16 18:29             ` Olivier Langlois
  0 siblings, 0 replies; 16+ messages in thread
From: Olivier Langlois @ 2024-09-16 18:29 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

On Fri, 2024-08-16 at 15:26 +0100, Pavel Begunkov wrote:
> I pushed what I had (the last 2 patches); you can use it as a
> reference, but be aware that it's a completely untested
> draft with some obvious problems and an ugly uapi.
> 
> https://github.com/isilence/linux.git manual-napi
> https://github.com/isilence/linux/commits/manual-napi/
> 
I did review what you have done...
I like your take on this feature idea... especially the autoremove
field.

The one aspect that I prefer in my version over yours is that my
implementation totally avoids calling __io_napi_add(), which includes a
table lookup, from io_poll_check_events(), which is a big chunk of the
dynamic tracking overhead.

also, when the napi devices are iterated in __io_napi_do_busy_loop(),
my version totally removes the conditional branching from the loop by
having 2 distinct busy loop functions.

but I guess the last point is arguably unimportant since the code is
busy polling...

Anyway, I have had the time to create a v2 of my implementation to
address the remarks made about v1, and I have completed testing it. It
has been running on my system for the last 24h flawlessly...

I will post it here shortly. Please take a look at it and pick the best
features of both implementations.


