Latency and throughput benchmarks
diff --git a/bench/latency.cc b/bench/latency.cc
new file mode 100644
index 0000000..55d20ec
--- /dev/null
+++ b/bench/latency.cc
@@ -0,0 +1,80 @@
+#include <benchmark/benchmark.h>
+
+#include <unistd.h>
+
+#include <pthreadpool.h>
+
+
+static void SetNumberOfThreads(benchmark::internal::Benchmark* benchmark) {
+	/* Registers thread counts 0..N (N = online CPUs); sysconf() reports -1 on failure, so N falls back to 1. */
+	const long onlineProcessors = sysconf(_SC_NPROCESSORS_ONLN);
+	const int maxThreads = onlineProcessors > 0 ? static_cast<int>(onlineProcessors) : 1;
+	for (int t = 0; t <= maxThreads; t++) { benchmark->Arg(t); }
+}
+
+
+static void compute_1d(void* /* context */, size_t /* x */) { /* no-op task: does nothing for its item */
+}
+
+/* Times one pthreadpool_compute_1d call per iteration; range(0) is both the thread count and the range. */
+static void pthreadpool_compute_1d(benchmark::State& state) {
+	const uint32_t threadCount = static_cast<uint32_t>(state.range(0));
+	/* Argument 0 selects the NULL-pool path: nothing is created, so nothing is destroyed below. */
+	pthreadpool_t pool = (threadCount != 0) ? pthreadpool_create(threadCount) : NULL;
+	while (state.KeepRunning()) {
+		pthreadpool_compute_1d(pool, compute_1d, NULL, threadCount);
+	}
+	if (threadCount != 0) { pthreadpool_destroy(pool); }
+}
+BENCHMARK(pthreadpool_compute_1d)->UseRealTime()->Apply(SetNumberOfThreads);
+
+
+static void compute_1d_tiled(void* /* context */, size_t /* x0 */, size_t /* xn */) { /* no-op task */
+}
+
+/* Times one pthreadpool_compute_1d_tiled call per iteration; the thread count and 1 are its size arguments. */
+static void pthreadpool_compute_1d_tiled(benchmark::State& state) {
+	const uint32_t threadCount = static_cast<uint32_t>(state.range(0));
+	/* Argument 0 selects the NULL-pool path: nothing is created, so nothing is destroyed below. */
+	pthreadpool_t pool = (threadCount != 0) ? pthreadpool_create(threadCount) : NULL;
+	while (state.KeepRunning()) {
+		pthreadpool_compute_1d_tiled(pool, compute_1d_tiled, NULL, threadCount, 1);
+	}
+	if (threadCount != 0) { pthreadpool_destroy(pool); }
+}
+BENCHMARK(pthreadpool_compute_1d_tiled)->UseRealTime()->Apply(SetNumberOfThreads);
+
+
+static void compute_2d(void* /* context */, size_t /* x */, size_t /* y */) { /* no-op task */
+}
+
+/* Times one pthreadpool_compute_2d call per iteration; its size arguments are 1 and the thread count. */
+static void pthreadpool_compute_2d(benchmark::State& state) {
+	const uint32_t threadCount = static_cast<uint32_t>(state.range(0));
+	/* Argument 0 selects the NULL-pool path: nothing is created, so nothing is destroyed below. */
+	pthreadpool_t pool = (threadCount != 0) ? pthreadpool_create(threadCount) : NULL;
+	while (state.KeepRunning()) {
+		pthreadpool_compute_2d(pool, compute_2d, NULL, 1, threadCount);
+	}
+	if (threadCount != 0) { pthreadpool_destroy(pool); }
+}
+BENCHMARK(pthreadpool_compute_2d)->UseRealTime()->Apply(SetNumberOfThreads);
+
+
+static void compute_2d_tiled(void* /* context */, size_t /* x0 */, size_t /* y0 */, size_t /* xn */, size_t /* yn */) { /* no-op task */
+}
+
+/* Times one pthreadpool_compute_2d_tiled call per iteration; its size arguments are 1, thread count, 1, 1. */
+static void pthreadpool_compute_2d_tiled(benchmark::State& state) {
+	const uint32_t threadCount = static_cast<uint32_t>(state.range(0));
+	/* Argument 0 selects the NULL-pool path: nothing is created, so nothing is destroyed below. */
+	pthreadpool_t pool = (threadCount != 0) ? pthreadpool_create(threadCount) : NULL;
+	while (state.KeepRunning()) {
+		pthreadpool_compute_2d_tiled(pool, compute_2d_tiled, NULL, 1, threadCount, 1, 1);
+	}
+	if (threadCount != 0) { pthreadpool_destroy(pool); }
+}
+BENCHMARK(pthreadpool_compute_2d_tiled)->UseRealTime()->Apply(SetNumberOfThreads);
+
+
+BENCHMARK_MAIN(); /* Google Benchmark macro; this file defines no main() of its own */
diff --git a/bench/throughput.cc b/bench/throughput.cc
new file mode 100644
index 0000000..cef3442
--- /dev/null
+++ b/bench/throughput.cc
@@ -0,0 +1,80 @@
+#include <benchmark/benchmark.h>
+
+#include <unistd.h>
+
+#include <pthreadpool.h>
+
+
+static void compute_1d(void* /* context */, size_t /* x */) { /* no-op task: does nothing for its item */
+}
+
+/* Throughput of pthreadpool_compute_1d: each timed call covers range(0) items times the pool's thread count. */
+static void pthreadpool_compute_1d(benchmark::State& state) {
+	const size_t itemsPerThread = static_cast<size_t>(state.range(0));
+	pthreadpool_t pool = pthreadpool_create(0);
+	const size_t totalItems = itemsPerThread * pthreadpool_get_threads_count(pool);
+	while (state.KeepRunning()) {
+		pthreadpool_compute_1d(pool, compute_1d, NULL, totalItems);
+	}
+	pthreadpool_destroy(pool);
+	/* Only the per-thread item count is reported; it is not scaled by the pool's thread count. */
+	state.SetItemsProcessed(int64_t(state.iterations()) * itemsPerThread);
+}
+BENCHMARK(pthreadpool_compute_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000);
+
+
+static void compute_1d_tiled(void* /* context */, size_t /* x0 */, size_t /* xn */) { /* no-op task */
+}
+
+/* Throughput of pthreadpool_compute_1d_tiled: each timed call covers range(0) items times the thread count. */
+static void pthreadpool_compute_1d_tiled(benchmark::State& state) {
+	const size_t itemsPerThread = static_cast<size_t>(state.range(0));
+	pthreadpool_t pool = pthreadpool_create(0);
+	const size_t totalItems = itemsPerThread * pthreadpool_get_threads_count(pool);
+	while (state.KeepRunning()) {
+		pthreadpool_compute_1d_tiled(pool, compute_1d_tiled, NULL, totalItems, 1);
+	}
+	pthreadpool_destroy(pool);
+	/* Only the per-thread item count is reported; it is not scaled by the pool's thread count. */
+	state.SetItemsProcessed(int64_t(state.iterations()) * itemsPerThread);
+}
+BENCHMARK(pthreadpool_compute_1d_tiled)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000);
+
+
+static void compute_2d(void* /* context */, size_t /* x */, size_t /* y */) { /* no-op task */
+}
+
+/* Throughput of pthreadpool_compute_2d: each timed call passes the thread count and range(0) as its sizes. */
+static void pthreadpool_compute_2d(benchmark::State& state) {
+	const size_t itemsPerThread = static_cast<size_t>(state.range(0));
+	pthreadpool_t pool = pthreadpool_create(0);
+	const size_t poolThreads = pthreadpool_get_threads_count(pool);
+	while (state.KeepRunning()) {
+		pthreadpool_compute_2d(pool, compute_2d, NULL, poolThreads, itemsPerThread);
+	}
+	pthreadpool_destroy(pool);
+	/* Only the per-thread item count is reported; it is not scaled by the pool's thread count. */
+	state.SetItemsProcessed(int64_t(state.iterations()) * itemsPerThread);
+}
+BENCHMARK(pthreadpool_compute_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000);
+
+
+static void compute_2d_tiled(void* /* context */, size_t /* x0 */, size_t /* y0 */, size_t /* xn */, size_t /* yn */) { /* no-op task */
+}
+
+/* Throughput of pthreadpool_compute_2d_tiled: each timed call passes thread count, range(0), 1, 1 as sizes. */
+static void pthreadpool_compute_2d_tiled(benchmark::State& state) {
+	const size_t itemsPerThread = static_cast<size_t>(state.range(0));
+	pthreadpool_t pool = pthreadpool_create(0);
+	const size_t poolThreads = pthreadpool_get_threads_count(pool);
+	while (state.KeepRunning()) {
+		pthreadpool_compute_2d_tiled(pool, compute_2d_tiled, NULL, poolThreads, itemsPerThread, 1, 1);
+	}
+	pthreadpool_destroy(pool);
+	/* Only the per-thread item count is reported; it is not scaled by the pool's thread count. */
+	state.SetItemsProcessed(int64_t(state.iterations()) * itemsPerThread);
+}
+BENCHMARK(pthreadpool_compute_2d_tiled)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000);
+
+
+BENCHMARK_MAIN(); /* Google Benchmark macro; this file defines no main() of its own */
diff --git a/configure.py b/configure.py
index 7a82481..4b1e93f 100755
--- a/configure.py
+++ b/configure.py
@@ -17,6 +17,10 @@
     with build.options(source_dir="test", deps=[build, build.deps.googletest]):
         build.unittest("pthreadpool-test", build.cxx("pthreadpool.cc"))
 
+    with build.options(source_dir="bench", deps=[build, build.deps.googlebenchmark]):
+        build.benchmark("latency-bench", build.cxx("latency.cc"))
+        build.benchmark("throughput-bench", build.cxx("throughput.cc"))
+
     return build
 
 
diff --git a/confu.yaml b/confu.yaml
index 4e86d5a..fc54f60 100644
--- a/confu.yaml
+++ b/confu.yaml
@@ -5,3 +5,4 @@
   - name: fxdiv
     url:  https://github.com/Maratyszcza/FXdiv.git
   - name: googletest
+  - name: googlebenchmark