From: Ammar Faizi
To: Alviro Iskandar Setiawan
Cc: Ammar Faizi,
    Muhammad Rizki,
    Kanna Scarlet,
    GNU/Weeb Mailing List
Subject: [RFC PATCH v1 2/2] chnet: Implement `get_thread()` and `put_thread()` function
Date: Mon, 29 Aug 2022 08:11:27 +0700
Message-Id: <20220829011127.3150320-3-ammarfaizi2@gnuweeb.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220829011127.3150320-1-ammarfaizi2@gnuweeb.org>
References: <20220829011127.3150320-1-ammarfaizi2@gnuweeb.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
List-Id:

Currently, a single chnet instance uses a single dedicated chromium thread
worker to perform an HTTP request. This doesn't scale well because we need
to spawn a new thread for every HTTP request. Performing 4096 HTTP requests
in parallel will spawn 4096 chromium thread workers, which is too expensive
and consumes too much memory.

A single chromium thread worker can handle multiple HTTP requests. This
series creates a fixed number of chromium thread workers and uses a ref
count to spread the jobs fairly across them. This greatly reduces the
number of context switches and improves performance. It also greatly
reduces memory usage.

Implementation:

1) At initialization, when chnet_global_init() is called, create an array
   of pointers to `struct ch_thpool` and initialize those pointers to null.
   The number of elements in the array is taken from
   `std::thread::hardware_concurrency() - 1`.

2) When a new CHNet instance is created, its constructor calls the
   `get_thread()` function, which allocates the corresponding
   `struct ch_thpool` entry in the array if needed, increments its ref
   count, and returns a pointer to the `base::Thread` object inside that
   `struct ch_thpool`.

3) When a CHNet instance is destroyed, its destructor calls the
   `put_thread()` function, which decrements the ref count of the
   `struct ch_thpool` and deletes the object if the ref count reaches zero
   after being decremented.

For the ring test case, the measured speedup is about 33%.
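
For illustration only (not part of this patch), the ref-count bookkeeping
described in points 1-3 can be sketched as a small standalone C++ snippet.
The names used here (pool_slot, acquire_slot, release_slot) are hypothetical
stand-ins for `struct ch_thpool`, `get_thread()` and `put_thread()`, and the
actual chromium base::Thread worker is omitted:

// Illustrative sketch only; it models the ref-count bookkeeping,
// not the real chromium threading in the diff below.
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

struct pool_slot {                  // hypothetical stand-in for `struct ch_thpool`
	uint32_t ref_count = 0;
	uint32_t idx = 0;
	// The real struct also owns the chromium worker (base::Thread).
};

static std::mutex g_lock;
static std::vector<pool_slot *> g_slots;  // fixed-size, entries start as null

static void pool_init(void)
{
	// Point 1: hardware_concurrency() - 1 slots, all initially null.
	uint32_t n = std::thread::hardware_concurrency();
	g_slots.assign(n > 1 ? n - 1 : 1, nullptr);
}

static pool_slot *acquire_slot(void)      // plays the role of get_thread()
{
	const uint32_t ref_split = 2048;  // reuse a worker until it holds this many refs
	std::lock_guard<std::mutex> lk(g_lock);

	if (g_slots.empty())
		return nullptr;           // pool_init() has not been called

	pool_slot *least = nullptr;
	for (uint32_t i = 0; i < g_slots.size(); i++) {
		pool_slot *s = g_slots[i];
		if (!s) {                 // empty slot: create a worker lazily
			s = new pool_slot;
			s->idx = i;
			g_slots[i] = s;
			s->ref_count++;
			return s;
		}
		if (s->ref_count < ref_split) {  // lightly loaded worker: reuse it
			s->ref_count++;
			return s;
		}
		if (!least || s->ref_count < least->ref_count)
			least = s;        // remember the least-referenced worker
	}
	least->ref_count++;               // every worker is loaded: pick the least loaded
	return least;
}

static void release_slot(pool_slot *s)    // plays the role of put_thread()
{
	std::lock_guard<std::mutex> lk(g_lock);
	if (--s->ref_count == 0) {        // last user gone: free the slot
		g_slots[s->idx] = nullptr;
		delete s;
	}
}

The 2048 threshold mirrors `nr_ref_split` in the real `get_thread()` below:
a worker keeps taking new requests until it already holds that many
references; once every worker is at or above the threshold, the
least-referenced one is chosen.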
Without this patch:

  ammarfaizi2@integral2:~/work/ncns$ time taskset -c 0-7 make -j8 test -s
  Running /home/ammarfaizi2/work/ncns/tests/cpp/ring.t

  real	0m28.184s
  user	0m52.368s
  sys	0m27.582s

With this patch:

  ammarfaizi2@integral2:~/work/ncns$ time taskset -c 0-7 make -j8 test -s
  Running /home/ammarfaizi2/work/ncns/tests/cpp/ring.t

  real	0m18.657s
  user	0m35.452s
  sys	0m2.146s

Signed-off-by: Ammar Faizi
---
 chnet/chnet.cc | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-
 chnet/chnet.h  |  2 +-
 2 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/chnet/chnet.cc b/chnet/chnet.cc
index 8662374..782e4fe 100644
--- a/chnet/chnet.cc
+++ b/chnet/chnet.cc
@@ -12,6 +12,9 @@
 #include
 
 using namespace std::chrono_literals;
 
+static base::Thread *get_thread(void);
+static void put_thread(void *thread);
+
 namespace net {
 
 CHNetSimplePayload::CHNetSimplePayload(const char *payload, size_t payload_len):
@@ -251,7 +254,7 @@ net::DefineNetworkTrafficAnnotation("CHNetDelegate", R"(
 	})");
 
 CHNetDelegate::CHNetDelegate(void):
-	thread_("chromium_thread"),
+	thread_(*get_thread()),
 	method_("GET"),
 	err_("")
 {
@@ -287,6 +290,7 @@ CHNetDelegate::~CHNetDelegate(void)
 	r->PostTask(FROM_HERE, base::BindOnce(CHNetDelegateDestruct, &url_req_,
 					      &url_req_ctx_, &sig));
 	sig.Wait();
+	put_thread(&thread_);
 }
 
 template
@@ -629,6 +633,81 @@ static uint32_t g_max_ch_thpool;
 static std::mutex g_thpool_lock_;
 static struct ch_thpool **g_thpool;
 
+
+static base::Thread *get_thread(void)
+{
+	const uint32_t max_ch_thpool = g_max_ch_thpool;
+	const uint32_t nr_ref_split = 2048;
+	struct ch_thpool **thp;
+	struct ch_thpool *ret = nullptr;
+	struct ch_thpool *tmp;
+	uint32_t min_ref_idx;
+	uint32_t min_ref;
+	uint32_t i;
+
+	g_thpool_lock_.lock();
+	thp = g_thpool;
+	if (!thp) {
+		g_thpool_lock_.unlock();
+		return nullptr;
+	}
+
+	tmp = thp[0];
+	if (!tmp) {
+		ret = new struct ch_thpool;
+		ret->idx_ = 0;
+		thp[0] = ret;
+		goto out;
+	}
+
+	min_ref = tmp->ref_count_;
+	min_ref_idx = 0;
+	for (i = 1; i < max_ch_thpool; i++) {
+		uint32_t ref;
+
+		tmp = thp[i];
+		if (!tmp) {
+			ret = new struct ch_thpool;
+			ret->idx_ = i;
+			thp[i] = ret;
+			goto out;
+		}
+
+		ref = tmp->ref_count_;
+		if (ref < nr_ref_split) {
+			ret = tmp;
+			break;
+		}
+
+		if (ref < min_ref) {
+			min_ref = ref;
+			min_ref_idx = i;
+		}
+	}
+
+	if (!ret)
+		ret = thp[min_ref_idx];
+
+out:
+	ret->ref_count_++;
+	g_thpool_lock_.unlock();
+	return &ret->thread_;
+}
+
+static void put_thread(void *thread)
+{
+	struct ch_thpool *th = (struct ch_thpool *)thread;
+
+	g_thpool_lock_.lock();
+	if (--th->ref_count_ == 0) {
+		if (g_thpool)
+			g_thpool[th->idx_] = nullptr;
+
+		delete th;
+	}
+	g_thpool_lock_.unlock();
+}
+
 static void init_g_ch_thpool(void)
 {
 	struct ch_thpool **tmp;
diff --git a/chnet/chnet.h b/chnet/chnet.h
index 2332596..8f7b098 100644
--- a/chnet/chnet.h
+++ b/chnet/chnet.h
@@ -275,7 +275,7 @@ private:
 	std::unique_ptr url_req_ctx_;
 	std::unique_ptr url_req_;
 	scoped_refptr read_buf_;
-	base::Thread thread_;
+	base::Thread &thread_;
 	std::atomic read_ret_;
 	std::atomic status_;
 	std::string method_;
-- 
Ammar Faizi