From ecb29549cf226ed129caa7d43ce79e8b2e4d9575 Mon Sep 17 00:00:00 2001
From: Jacob Lifshay <programmerjake@gmail.com>
Date: Thu, 21 Jul 2022 03:18:09 -0700
Subject: [PATCH] benchmarking c++11 atomics works

---
 .vscode/launch.json                           |  18 ++
 .vscode/settings.json                         |   1 +
 CMakeLists.txt                                |  10 +-
 Makefile                                      |   1 +
 harness.cpp                                   |   8 -
 harness.h                                     |  57 ----
 src/aarch64/aarch64_benchmarks.cpp            |  20 ++
 src/aarch64/aarch64_benchmarks.h              |   5 +
 src/all_benchmarks.cpp                        |  28 ++
 src/all_benchmarks.h                          |   5 +
 .../c11_atomics/c11_atomics_benchmarks.cpp    | 223 +++++++++++++++
 .../c11_atomics/c11_atomics_benchmarks.h      |   5 +
 src/common/common_benchmarks.cpp              |   8 +
 src/common/common_benchmarks.h                |   5 +
 src/harness.cpp                               | 262 ++++++++++++++++++
 src/harness.h                                 | 117 ++++++++
 main.cpp => src/main.cpp                      | 104 ++++++-
 src/powerpc64le/powerpc64le_benchmarks.cpp    |  21 ++
 src/powerpc64le/powerpc64le_benchmarks.h      |   5 +
 src/x86_64/x86_64_benchmarks.cpp              |  20 ++
 src/x86_64/x86_64_benchmarks.h                |   5 +
 21 files changed, 857 insertions(+), 71 deletions(-)
 create mode 100644 .vscode/launch.json
 delete mode 100644 harness.cpp
 delete mode 100644 harness.h
 create mode 100644 src/aarch64/aarch64_benchmarks.cpp
 create mode 100644 src/aarch64/aarch64_benchmarks.h
 create mode 100644 src/all_benchmarks.cpp
 create mode 100644 src/all_benchmarks.h
 create mode 100644 src/common/c11_atomics/c11_atomics_benchmarks.cpp
 create mode 100644 src/common/c11_atomics/c11_atomics_benchmarks.h
 create mode 100644 src/common/common_benchmarks.cpp
 create mode 100644 src/common/common_benchmarks.h
 create mode 100644 src/harness.cpp
 create mode 100644 src/harness.h
 rename main.cpp => src/main.cpp (75%)
 create mode 100644 src/powerpc64le/powerpc64le_benchmarks.cpp
 create mode 100644 src/powerpc64le/powerpc64le_benchmarks.h
 create mode 100644 src/x86_64/x86_64_benchmarks.cpp
 create mode 100644 src/x86_64/x86_64_benchmarks.h

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..1a14a9d
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,18 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "lldb",
+            "request": "launch",
+            "name": "Debug",
+            "program": "${workspaceFolder}/build-x86_64/benchmarks",
+            "args": [
+                "-j1"
+            ],
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 630965c..c096ee7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,4 +4,5 @@
         "-DCMAKE_TOOLCHAIN_FILE=toolchain-x86_64-linux-gnu.cmake"
     ],
     "cmake.copyCompileCommands": "${workspaceFolder}/compile_commands.json",
+    "cmake.generator": "Unix Makefiles",
 }
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f239c80..025462d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,17 @@
-cmake_minimum_required(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.12.0)
 project(benchmarks VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
+find_package(Threads REQUIRED)
+
 add_compile_options(-Wall -Wextra -Wimplicit-fallthrough)
-add_executable(benchmarks main.cpp harness.cpp)
+file(GLOB_RECURSE sources
+    RELATIVE ${CMAKE_SOURCE_DIR}
+    CONFIGURE_DEPENDS src/*.cpp src/*.c)
+add_executable(benchmarks "${sources}")
+target_link_libraries(benchmarks Threads::Threads)
 
 set(CPACK_PROJECT_NAME ${PROJECT_NAME})
 set(CPACK_PROJECT_VERSION ${PROJECT_VERSION})
diff --git a/Makefile b/Makefile
index a701537..64bacb5 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ all: $(foreach arch,$(enabled_arches),build-$(arch)/benchmarks)
 
 common_cmake_flags = -S .
 common_cmake_flags += -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE
+common_cmake_flags += -DCMAKE_BUILD_TYPE=RelWithDebInfo
 
 reset_make_env = "MAKEFLAGS=" "MFLAGS=" "MAKELEVEL=" "MAKE_TERMERR=" "MAKE_TERMOUT="
 
diff --git a/harness.cpp b/harness.cpp
deleted file mode 100644
index 2de66f0..0000000
--- a/harness.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "harness.h"
-
-void BenchHarnessBase::base_run(
-    Config config, void (*fn)(BenchHarnessBase *bench_harness_base,
-                              std::uint64_t iteration_count))
-{
-    // FIXME: finish
-}
\ No newline at end of file
diff --git a/harness.h b/harness.h
deleted file mode 100644
index 3c4cd28..0000000
--- a/harness.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#pragma once
-
-#include <chrono>
-#include <cstdint>
-#include <optional>
-#include <type_traits>
-#include <utility>
-
-struct Config final
-{
-    std::optional<std::uint32_t> thread_count;
-    std::optional<std::uint64_t> iteration_count;
-};
-
-template <typename Fn, typename Input,
-          typename Output = std::invoke_result_t<Fn, Input>>
-class BenchHarness;
-
-class BenchHarnessBase
-{
-    template <typename Fn, typename Input, typename Output>
-    friend class BenchHarness;
-
-  private:
-    void base_run(Config config,
-                  void (*fn)(BenchHarnessBase *bench_harness_base,
-                             std::uint64_t iteration_count));
-};
-
-template <typename Fn, typename Input>
-class BenchHarness<Fn, Input, std::invoke_result_t<Fn, Input>> final
-    : private BenchHarnessBase
-{
-  private:
-    Fn fn;
-
-  public:
-    void run(Config config)
-    {
-        base_run(config, [](BenchHarnessBase *bench_harness_base,
-                            std::uint64_t iteration_count) {
-            auto &fn = static_cast<BenchHarness *>(bench_harness_base)->fn;
-            for (std::uint64_t i = 0; i < iteration_count; i++)
-            {
-                Input input;
-
-                // optimization barrier
-                asm("" : : "r"(std::addressof(input)) : "memory");
-
-                auto output = fn(input);
-
-                // optimization barrier
-                asm("" : : "r"(std::addressof(output)) : "memory");
-            }
-        });
-    }
-};
\ No newline at end of file
diff --git a/src/aarch64/aarch64_benchmarks.cpp b/src/aarch64/aarch64_benchmarks.cpp
new file mode 100644
index 0000000..81f729c
--- /dev/null
+++ b/src/aarch64/aarch64_benchmarks.cpp
@@ -0,0 +1,20 @@
+#include "aarch64_benchmarks.h"
+
+#ifdef __aarch64__
+
+std::vector<Benchmark> aarch64_benchmarks(Config config)
+{
+    std::vector<Benchmark> retval;
+    // TODO: add aarch64 benchmarks
+    (void)config;
+    return retval;
+}
+
+#else
+
+std::vector<Benchmark> aarch64_benchmarks(Config)
+{
+    return {};
+}
+
+#endif
\ No newline at end of file
diff --git a/src/aarch64/aarch64_benchmarks.h b/src/aarch64/aarch64_benchmarks.h
new file mode 100644
index 0000000..b79cc9f
--- /dev/null
+++ b/src/aarch64/aarch64_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "../harness.h"
+
+std::vector<Benchmark> aarch64_benchmarks(Config config);
\ No newline at end of file
diff --git a/src/all_benchmarks.cpp b/src/all_benchmarks.cpp
new file mode 100644
index 0000000..748541c
--- /dev/null
+++ b/src/all_benchmarks.cpp
@@ -0,0 +1,28 @@
+#include "all_benchmarks.h"
+#include "aarch64/aarch64_benchmarks.h"
+#include "common/common_benchmarks.h"
+#include "powerpc64le/powerpc64le_benchmarks.h"
+#include "x86_64/x86_64_benchmarks.h"
+#include <algorithm>
+#include <iterator>
+
+// Builds the full benchmark list: the architecture-independent benchmarks
+// first, then the x86_64, aarch64, and powerpc64le ones, in that order.
+std::vector<Benchmark> all_benchmarks(Config config)
+{
+    using Provider = std::vector<Benchmark> (*)(Config);
+    static constexpr Provider arch_providers[] = {
+        &x86_64_benchmarks,
+        &aarch64_benchmarks,
+        &powerpc64le_benchmarks,
+    };
+
+    std::vector<Benchmark> combined = common_benchmarks(config);
+    for (Provider provider : arch_providers)
+    {
+        std::vector<Benchmark> extra = provider(config);
+        for (Benchmark &bench : extra)
+            combined.push_back(std::move(bench));
+    }
+    return combined;
+}
\ No newline at end of file
diff --git a/src/all_benchmarks.h b/src/all_benchmarks.h
new file mode 100644
index 0000000..9447131
--- /dev/null
+++ b/src/all_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "harness.h"
+
+std::vector<Benchmark> all_benchmarks(Config config);
\ No newline at end of file
diff --git a/src/common/c11_atomics/c11_atomics_benchmarks.cpp b/src/common/c11_atomics/c11_atomics_benchmarks.cpp
new file mode 100644
index 0000000..13dc5af
--- /dev/null
+++ b/src/common/c11_atomics/c11_atomics_benchmarks.cpp
@@ -0,0 +1,223 @@
+#include "c11_atomics_benchmarks.h"
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string_view>
+#include <type_traits>
+
+template <typename T> struct IntTypeName;
+
+template <typename T>
+inline constexpr std::string_view int_type_name = IntTypeName<T>::name;
+// `uint` is a macro parameter here, so std::uint##sz##_t pastes the 3rd argument.
+#define INT_TYPE_NAME(sz, ui, uint)                                           \
+    template <> struct IntTypeName<std::uint##sz##_t> final                   \
+    {                                                                         \
+        static constexpr std::string_view name = #ui #sz;                     \
+    };
+
+INT_TYPE_NAME(8, u, uint)
+INT_TYPE_NAME(16, u, uint)
+INT_TYPE_NAME(32, u, uint)
+INT_TYPE_NAME(64, u, uint)
+INT_TYPE_NAME(8, i, int)
+INT_TYPE_NAME(16, i, int)
+INT_TYPE_NAME(32, i, int)
+INT_TYPE_NAME(64, i, int)
+
+template <std::memory_order order> struct MemoryOrderName;
+
+template <std::memory_order order>
+inline constexpr std::string_view memory_order_name =
+    MemoryOrderName<order>::name;
+
+#define MEMORY_ORDER_NAME(order)                                              \
+    template <> struct MemoryOrderName<std::memory_order_##order> final       \
+    {                                                                         \
+        static constexpr std::string_view name = #order;                      \
+    };
+
+MEMORY_ORDER_NAME(relaxed)
+MEMORY_ORDER_NAME(acquire)
+MEMORY_ORDER_NAME(release)
+MEMORY_ORDER_NAME(acq_rel)
+MEMORY_ORDER_NAME(seq_cst)
+
+template <typename T> using Buf = std::shared_ptr<std::vector<std::atomic<T>>>;
+
+template <typename Fn, typename T, typename... NameParts>
+static void push_atomic_bench(std::vector<Benchmark> &benches, Config config,
+                              Buf<T> buf, Fn fn, NameParts &&...name_parts)
+{
+    auto log2_stride = config.log2_stride;
+    std::size_t index_mask = 1;
+    index_mask <<= config.log2_stride;
+    index_mask <<= config.log2_memory_location_count;
+    index_mask--;
+    push_bench(
+        benches,
+        [buf, fn, index_mask, log2_stride](T input, std::uint64_t iteration,
+                                           std::uint32_t thread_num) {
+            std::size_t index = iteration;
+            index ^= static_cast<std::size_t>(thread_num) * 0x12345;
+            index <<= log2_stride;
+            index &= index_mask;
+            std::atomic<T> *atomic = &(*buf)[index];
+            input ^= static_cast<T>(iteration);
+            return fn(input, atomic);
+        },
+        T{}, name_parts...);
+}
+
+template <typename T, std::memory_order order>
+static void rmw_benchmarks(std::vector<Benchmark> &benches, Config config,
+                           Buf<T> buf)
+{
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_exchange_explicit(atomic, input, order);
+        },
+        "atomic_exchange_", int_type_name<T>, "_", memory_order_name<order>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_fetch_add_explicit(atomic, input, order);
+        },
+        "atomic_fetch_add_", int_type_name<T>, "_", memory_order_name<order>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_fetch_sub_explicit(atomic, input, order);
+        },
+        "atomic_fetch_sub_", int_type_name<T>, "_", memory_order_name<order>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_fetch_and_explicit(atomic, input, order);
+        },
+        "atomic_fetch_and_", int_type_name<T>, "_", memory_order_name<order>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_fetch_or_explicit(atomic, input, order);
+        },
+        "atomic_fetch_or_", int_type_name<T>, "_", memory_order_name<order>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_fetch_xor_explicit(atomic, input, order);
+        },
+        "atomic_fetch_xor_", int_type_name<T>, "_", memory_order_name<order>);
+}
+
+template <typename T, std::memory_order order>
+static void load_benchmarks(std::vector<Benchmark> &benches, Config config,
+                            Buf<T> buf)
+{
+    push_atomic_bench(
+        benches, config, buf,
+        [](T, std::atomic<T> *atomic) {
+            return std::atomic_load_explicit(atomic, order);
+        },
+        "atomic_load_", int_type_name<T>, "_", memory_order_name<order>);
+}
+
+template <typename T, std::memory_order order>
+static void store_benchmarks(std::vector<Benchmark> &benches, Config config,
+                             Buf<T> buf)
+{
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            return std::atomic_store_explicit(atomic, input, order);
+        },
+        "atomic_store_", int_type_name<T>, "_", memory_order_name<order>);
+}
+
+template <typename T, std::memory_order succ, std::memory_order fail>
+static void cmp_xchg_benchmarks(std::vector<Benchmark> &benches, Config config,
+                                Buf<T> buf)
+{
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            T expected = input >> 1;
+            bool succeeded = std::atomic_compare_exchange_weak_explicit(
+                atomic, &expected, input, succ, fail);
+            return std::pair(expected, succeeded);
+        },
+        "atomic_compare_exchange_weak_", int_type_name<T>, "_",
+        memory_order_name<succ>, "_", memory_order_name<fail>);
+    push_atomic_bench(
+        benches, config, buf,
+        [](T input, std::atomic<T> *atomic) {
+            T expected = input >> 1;
+            bool succeeded = std::atomic_compare_exchange_strong_explicit(
+                atomic, &expected, input, succ, fail);
+            return std::pair(expected, succeeded);
+        },
+        "atomic_compare_exchange_strong_", int_type_name<T>, "_",
+        memory_order_name<succ>, "_", memory_order_name<fail>);
+}
+
+template <typename T>
+static void benchmarks(std::vector<Benchmark> &benches, Config config)
+{
+    std::size_t buf_size = 1;
+    buf_size <<= config.log2_memory_location_count;
+    buf_size <<= config.log2_stride;
+    Buf<T> buf = std::make_shared<std::vector<std::atomic<T>>>(buf_size);
+
+    rmw_benchmarks<T, std::memory_order_relaxed>(benches, config, buf);
+    rmw_benchmarks<T, std::memory_order_acquire>(benches, config, buf);
+    rmw_benchmarks<T, std::memory_order_release>(benches, config, buf);
+    rmw_benchmarks<T, std::memory_order_acq_rel>(benches, config, buf);
+    rmw_benchmarks<T, std::memory_order_seq_cst>(benches, config, buf);
+
+    load_benchmarks<T, std::memory_order_relaxed>(benches, config, buf);
+    load_benchmarks<T, std::memory_order_acquire>(benches, config, buf);
+    load_benchmarks<T, std::memory_order_seq_cst>(benches, config, buf);
+
+    store_benchmarks<T, std::memory_order_relaxed>(benches, config, buf);
+    store_benchmarks<T, std::memory_order_release>(benches, config, buf);
+    store_benchmarks<T, std::memory_order_seq_cst>(benches, config, buf);
+
+    cmp_xchg_benchmarks<T, std::memory_order_relaxed,
+                        std::memory_order_relaxed>(benches, config, buf);
+
+    cmp_xchg_benchmarks<T, std::memory_order_acquire,
+                        std::memory_order_relaxed>(benches, config, buf);
+    cmp_xchg_benchmarks<T, std::memory_order_acquire,
+                        std::memory_order_acquire>(benches, config, buf);
+
+    cmp_xchg_benchmarks<T, std::memory_order_release,
+                        std::memory_order_relaxed>(benches, config, buf);
+
+    cmp_xchg_benchmarks<T, std::memory_order_acq_rel,
+                        std::memory_order_relaxed>(benches, config, buf);
+    cmp_xchg_benchmarks<T, std::memory_order_acq_rel,
+                        std::memory_order_acquire>(benches, config, buf);
+
+    cmp_xchg_benchmarks<T, std::memory_order_seq_cst,
+                        std::memory_order_relaxed>(benches, config, buf);
+    cmp_xchg_benchmarks<T, std::memory_order_seq_cst,
+                        std::memory_order_acquire>(benches, config, buf);
+    cmp_xchg_benchmarks<T, std::memory_order_seq_cst,
+                        std::memory_order_seq_cst>(benches, config, buf);
+}
+
+std::vector<Benchmark> c11_atomics_benchmarks(Config config)
+{
+    std::vector<Benchmark> benches;
+    benchmarks<std::uint8_t>(benches, config);
+    benchmarks<std::uint16_t>(benches, config);
+    benchmarks<std::uint32_t>(benches, config);
+    benchmarks<std::uint64_t>(benches, config);
+    benchmarks<std::int8_t>(benches, config);
+    benchmarks<std::int16_t>(benches, config);
+    benchmarks<std::int32_t>(benches, config);
+    benchmarks<std::int64_t>(benches, config);
+    return benches;
+}
\ No newline at end of file
diff --git a/src/common/c11_atomics/c11_atomics_benchmarks.h b/src/common/c11_atomics/c11_atomics_benchmarks.h
new file mode 100644
index 0000000..9966297
--- /dev/null
+++ b/src/common/c11_atomics/c11_atomics_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "../../harness.h"
+
+std::vector<Benchmark> c11_atomics_benchmarks(Config config);
\ No newline at end of file
diff --git a/src/common/common_benchmarks.cpp b/src/common/common_benchmarks.cpp
new file mode 100644
index 0000000..42007a4
--- /dev/null
+++ b/src/common/common_benchmarks.cpp
@@ -0,0 +1,8 @@
+#include "common_benchmarks.h"
+#include "c11_atomics/c11_atomics_benchmarks.h"
+
+std::vector<Benchmark> common_benchmarks(Config config)
+{
+    auto retval = c11_atomics_benchmarks(config);
+    return retval;
+}
\ No newline at end of file
diff --git a/src/common/common_benchmarks.h b/src/common/common_benchmarks.h
new file mode 100644
index 0000000..9505408
--- /dev/null
+++ b/src/common/common_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "../harness.h"
+
+std::vector<Benchmark> common_benchmarks(Config config);
\ No newline at end of file
diff --git a/src/harness.cpp b/src/harness.cpp
new file mode 100644
index 0000000..affe919
--- /dev/null
+++ b/src/harness.cpp
@@ -0,0 +1,262 @@
+#include "harness.h"
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <condition_variable>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <ostream>
+#include <shared_mutex>
+#include <thread>
+#include <variant>
+
+#ifdef NDEBUG // assert needs to work even in release mode
+#undef NDEBUG
+#endif
+#include <cassert>
+
+using std::chrono::steady_clock;
+
+// Pool of reusable worker threads shared by benchmark runs. The controlling
+// thread holds `state_lock` exclusively (through `locked_state`) from
+// construction on, releasing it only while waiting in drain() and on destroy.
+class BenchHarnessBase::ThreadCache final
+{
+  private:
+    std::vector<std::thread> threads;
+    std::shared_mutex state_lock;
+    std::unique_lock<std::shared_mutex> locked_state;
+    std::condition_variable_any cond_var;
+    struct Task final
+    {
+        std::function<void()> fn;
+    };
+    struct ThreadState final
+    {
+        std::unique_ptr<Task> task;
+        std::mutex mutex;
+    };
+    std::vector<std::shared_ptr<ThreadState>> states;
+    bool shutting_down = false;
+    std::atomic_size_t tasks_left_to_drain = 0;
+    // Spawns one worker. Each worker takes `state_lock` shared, then loops:
+    // run the pending task if any, else exit on shutdown, else sleep.
+    void add_thread()
+    {
+        auto thread_state = std::make_shared<ThreadState>();
+        states.push_back(thread_state);
+        threads.push_back(std::thread([this, thread_state]() {
+            auto shared_lock = std::shared_lock(state_lock);
+            while (true)
+            {
+                auto lock = std::unique_lock(thread_state->mutex);
+                auto task = std::move(thread_state->task);
+                lock.unlock();
+                if (task)
+                {
+                    task->fn();
+                    task.reset();
+                    tasks_left_to_drain--;
+                    cond_var.notify_all();
+                    continue;
+                }
+
+                if (this->shutting_down)
+                    return;
+
+                cond_var.wait(shared_lock);
+            }
+        }));
+    }
+
+  public:
+    // Takes the exclusive lock so workers stay parked until drain().
+    ThreadCache()
+    {
+        locked_state = std::unique_lock(state_lock);
+    }
+    ThreadCache(const ThreadCache &) = delete;
+    ThreadCache &operator=(const ThreadCache &) = delete;
+    // Flags shutdown, wakes and unblocks the workers, then joins them all.
+    ~ThreadCache()
+    {
+        shutting_down = true;
+        cond_var.notify_all();
+        locked_state.unlock();
+        for (auto &thread : threads)
+        {
+            thread.join();
+        }
+    }
+    // Returns the shared cache, creating it if no live instance exists.
+    static std::shared_ptr<ThreadCache> get()
+    {
+        // weak so it's destroyed before returning from main()
+        static std::weak_ptr<ThreadCache> static_thread_cache;
+
+        std::shared_ptr<ThreadCache> thread_cache = static_thread_cache.lock();
+        if (!thread_cache)
+        {
+            thread_cache = std::make_shared<ThreadCache>();
+            static_thread_cache = thread_cache;
+        }
+        return thread_cache;
+    }
+    // Also stores the cache in `bhb` and grows it to >= `thread_count` threads.
+    static std::shared_ptr<ThreadCache> get(BenchHarnessBase &bhb,
+                                            std::uint32_t thread_count)
+    {
+        std::shared_ptr<ThreadCache> thread_cache = get();
+        bhb.thread_cache = thread_cache;
+        while (thread_cache->threads.size() < thread_count)
+            thread_cache->add_thread();
+        return thread_cache;
+    }
+    // Blocks until every scheduled task has finished running.
+    void drain()
+    {
+        while (tasks_left_to_drain > 0)
+        {
+            // unlocks state_lock, allowing all threads to proceed
+            // simultaneously
+            cond_var.wait(locked_state);
+        }
+    }
+    // Queues `fn` for worker `thread_num`, whose slot must be empty. Uses plain
+    // aggregate init: designated initializers are C++20, this builds as C++17.
+    template <typename Fn> void schedule_on(std::uint32_t thread_num, Fn fn)
+    {
+        auto lock = std::unique_lock(states[thread_num]->mutex);
+        assert(!states[thread_num]->task);
+        tasks_left_to_drain++;
+        states[thread_num]->task = std::make_unique<Task>(Task{std::move(fn)});
+        cond_var.notify_all();
+    }
+};
+
+// Stream adaptor: prints a duration (held in seconds) in the first of sec, ms,
+// us, ns, ps whose cutoff its magnitude exceeds; otherwise in plain seconds.
+struct WriteDuration final
+{
+    std::chrono::duration<double> dur;
+    friend std::ostream &operator<<(std::ostream &os,
+                                    const WriteDuration &wdur)
+    {
+        struct Unit final
+        {
+            double cutoff; // unit applies when |seconds| exceeds this
+            double scale;  // multiplier converting seconds to the unit
+            const char *suffix;
+        };
+        static constexpr Unit sub_second_units[] = {
+            {0.1e-3, 1e3, " ms"},
+            {0.1e-6, 1e6, " us"},
+            {0.1e-9, 1e9, " ns"},
+            {0.1e-12, 1e12, " ps"},
+        };
+        const double seconds = wdur.dur.count();
+        const double magnitude = std::fabs(seconds);
+        if (std::isfinite(seconds) && !(magnitude > 0.1))
+        {
+            for (const Unit &unit : sub_second_units)
+            {
+                if (magnitude > unit.cutoff)
+                    return os << seconds * unit.scale << unit.suffix;
+            }
+        }
+        // non-finite, > 0.1 s, or too small for ps: print raw seconds
+        return os << seconds << " sec";
+    }
+};
+// Times `fn` on every thread and prints per-thread (if >1) and average timings.
+void BenchHarnessBase::base_run(
+    Config config,
+    void (*fn)(BenchHarnessBase *bench_harness_base,
+               std::uint64_t iteration_count, std::uint32_t thread_num))
+{
+    // Runs inline (no pool) when the count is 0, or is 1 without being requested.
+    std::uint32_t thread_count =
+        config.thread_count.value_or(std::thread::hardware_concurrency());
+    bool no_threads =
+        thread_count == 0 || (thread_count == 1 && !config.thread_count);
+    if (no_threads)
+    {
+        thread_count = 1;
+    }
+
+    std::vector<steady_clock::duration> elapsed(thread_count);
+    auto run_base = [&](std::uint64_t iteration_count,
+                        std::uint32_t thread_num) {
+        auto start_time = steady_clock::now();
+        fn(this, iteration_count, thread_num);
+        auto end_time = steady_clock::now();
+        elapsed[thread_num] = end_time - start_time;
+    };
+    auto run = [&](std::uint64_t iteration_count) {
+        if (no_threads)
+        {
+            return run_base(iteration_count, 0);
+        }
+        auto thread_cache = ThreadCache::get(*this, thread_count);
+        for (std::uint32_t thread_num = 0; thread_num < thread_count;
+             thread_num++)
+        {
+            thread_cache->schedule_on(
+                thread_num, [&run_base, iteration_count, thread_num]() {
+                    run_base(iteration_count, thread_num);
+                });
+        }
+        thread_cache->drain();
+    };
+    std::uint64_t iteration_count = 1;
+    if (config.iteration_count)
+    {
+        iteration_count = *config.iteration_count;
+        run(iteration_count);
+    }
+    else
+    {
+        while (true)
+        {
+            run(iteration_count);
+            steady_clock::duration total_elapsed{};
+            for (auto i : elapsed)
+            {
+                total_elapsed += i;
+            }
+            auto target_average_elapsed = std::chrono::milliseconds(500);
+            if (total_elapsed > thread_count * target_average_elapsed)
+            {
+                break;
+            }
+            iteration_count <<= 1; // average still under target: double and retry
+        }
+    }
+    steady_clock::duration total_elapsed{};
+    for (std::uint32_t thread_num = 0; thread_num < thread_count; thread_num++)
+    {
+        total_elapsed += elapsed[thread_num];
+        if (thread_count > 1)
+        {
+            auto dur = std::chrono::duration<double>(elapsed[thread_num]);
+            std::cout << "Thread #" << thread_num << " took "
+                      << WriteDuration{dur} << " for " << iteration_count
+                      << " iterations -- "
+                      << WriteDuration{dur / iteration_count} << "/iter.\n";
+        }
+    }
+    auto total = std::chrono::duration<double>(total_elapsed);
+    std::cout << "Average elapsed time: "
+              << WriteDuration{total / thread_count} << " for "
+              << iteration_count << " iterations -- "
+              << WriteDuration{total / thread_count / iteration_count}
+              << "/iter.\n"
+              << std::endl;
+}
+
+std::shared_ptr<void> BenchHarnessBase::get_thread_cache()
+{
+    return ThreadCache::get();
+}
\ No newline at end of file
diff --git a/src/harness.h b/src/harness.h
new file mode 100644
index 0000000..aacd27a
--- /dev/null
+++ b/src/harness.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+struct Config final
+{
+    std::optional<std::uint32_t> thread_count;
+    std::optional<std::uint64_t> iteration_count;
+    std::uint32_t log2_memory_location_count = 0;
+    std::uint32_t log2_stride = 0;
+    static constexpr std::uint32_t max_sum_log2_mem_loc_count_and_stride = 28;
+};
+
+template <typename Fn, typename Input> class BenchHarness;
+
+class BenchHarnessBase
+{
+    template <typename Fn, typename Input> friend class BenchHarness;
+
+  private:
+    std::shared_ptr<void> thread_cache;
+    class ThreadCache;
+    friend class ThreadCache;
+    void base_run(Config config,
+                  void (*fn)(BenchHarnessBase *bench_harness_base,
+                             std::uint64_t iteration_count,
+                             std::uint32_t thread_num));
+
+  public:
+    static std::shared_ptr<void> get_thread_cache();
+};
+
+template <typename Fn, typename Input>
+class BenchHarness final : private BenchHarnessBase
+{
+  private:
+    Fn fn;
+    Input input;
+
+  public:
+    BenchHarness(Fn fn, Input input)
+        : fn(std::move(fn)), input(std::move(input))
+    {
+    }
+    void run(Config config)
+    {
+        base_run(config, [](BenchHarnessBase *bench_harness_base,
+                            std::uint64_t iteration_count,
+                            std::uint32_t thread_num) {
+            auto self = static_cast<BenchHarness *>(bench_harness_base);
+            auto &fn = self->fn;
+            // copy for repeatability, also so optimization barrier is on copy,
+            // not self
+            auto input = self->input;
+            for (std::uint64_t i = 0; i < iteration_count; i++)
+            {
+                // optimization barrier
+                asm("" : : "r"(std::addressof(input)) : "memory");
+
+                if constexpr (std::is_void_v<std::invoke_result_t<
+                                  Fn &, Input, decltype(i),
+                                  decltype(thread_num)>>)
+                {
+                    fn(input, i, thread_num);
+                }
+                else
+                {
+                    auto output = fn(input, i, thread_num);
+
+                    // optimization barrier
+                    asm("" : : "r"(std::addressof(output)) : "memory");
+                }
+            }
+        });
+    }
+};
+
+// A named benchmark; run() builds a fresh BenchHarness from the stored
+// function and input on every call, so it can be run repeatedly.
+class Benchmark final
+{
+  private:
+    std::string m_name;
+    std::function<void(Config config)> m_run;
+
+  public:
+    // The closure owns `fn` and `input`; each run hands the harness copies.
+    template <typename Fn, typename Input>
+    explicit Benchmark(Fn fn, Input input, std::string name)
+        : m_name(std::move(name)),
+          m_run([fn = std::move(fn), input = std::move(input)](
+                    Config config) { BenchHarness(fn, input).run(config); })
+    {
+    }
+    void run(Config config) { m_run(config); }
+    const std::string &name() const
+    {
+        return m_name;
+    }
+};
+
+template <typename Fn, typename Input, typename... NameParts>
+void push_bench(std::vector<Benchmark> &benches, Fn fn, Input input,
+                NameParts &&...name_parts)
+{
+    std::ostringstream os;
+    (os << ... << std::forward<NameParts>(name_parts));
+    benches.push_back(Benchmark(std::move(fn), std::move(input), os.str()));
+}
diff --git a/main.cpp b/src/main.cpp
similarity index 75%
rename from main.cpp
rename to src/main.cpp
index 2008393..feafbb1 100644
--- a/main.cpp
+++ b/src/main.cpp
@@ -1,15 +1,18 @@
+#include "all_benchmarks.h"
 #include "harness.h"
 #include <charconv>
 #include <cstdlib>
 #include <functional>
 #include <initializer_list>
 #include <iostream>
+#include <limits>
 #include <map>
 #include <optional>
 #include <ostream>
 #include <string_view>
 #include <system_error>
 #include <type_traits>
+#include <unordered_set>
 #include <vector>
 
 using namespace std::literals;
@@ -315,9 +318,17 @@ class OptionsParser final
         }
         std::exit(sizeof...(error_msg) == 0 ? 0 : 1);
     }
+    template <typename Int, typename = void> struct ParseIntLimits;
+    template <typename Int>
+    struct ParseIntLimits<Int, std::enable_if_t<std::is_integral_v<Int>>> final
+    { // accepted range for parse_int; by default the whole range of Int
+        Int min_value = std::numeric_limits<Int>::min(); // smallest accepted
+        Int max_value = std::numeric_limits<Int>::max(); // largest accepted
+    };
     template <typename Int>
     std::enable_if_t<std::is_integral_v<Int>, void> parse_int(
-        std::optional<std::string_view> value, Int &i_value)
+        std::optional<std::string_view> value, Int &i_value,
+        ParseIntLimits<Int> limits = {})
     {
         i_value = Int();
         if (!value)
@@ -347,6 +358,13 @@ class OptionsParser final
         {
             result.ec = std::errc::invalid_argument;
         }
+        if (result.ec == std::errc())
+        {
+            if (i_value < limits.min_value || i_value > limits.max_value)
+            {
+                result.ec = std::errc::result_out_of_range;
+            }
+        }
         if (result.ec == std::errc::result_out_of_range)
         {
             help_and_exit("value out of range: ", current_option(), "=",
@@ -360,7 +378,7 @@ class OptionsParser final
     template <typename Int>
     std::enable_if_t<std::is_integral_v<Int>, void> parse_int(
         std::optional<std::string_view> value, std::optional<Int> &i_value,
-        bool required = true)
+        bool required = true, ParseIntLimits<Int> limits = {})
     {
         if (!required && !value)
         {
@@ -368,7 +386,7 @@ class OptionsParser final
             return;
         }
         i_value.emplace();
-        this->parse_int(value, i_value.value());
+        this->parse_int(value, i_value.value(), limits);
     }
 };
 
@@ -381,6 +399,7 @@ inline std::vector<std::string_view> Options::parse(
 int main(int, char **argv)
 {
     Config config{};
+    std::optional<std::unordered_set<std::string>> enabled_benchmarks;
     Options options{
         Option{
             .short_name = 'h',
@@ -406,6 +425,44 @@ int main(int, char **argv)
                    [&](OptionsParser &parser, auto value) {
                        parser.parse_int(value, config.iteration_count);
                    }},
+        Option{.long_name = "log2-mem-loc-count",
+               .description =
+                   "Log base 2 of the number of memory locations to access",
+               .value_kind = OptionValueKind::Required,
+               .parse_value =
+                   [&](OptionsParser &parser, auto value) {
+                       parser.parse_int(
+                           value, config.log2_memory_location_count,
+                           {.max_value =
+                                Config::max_sum_log2_mem_loc_count_and_stride -
+                                config.log2_stride});
+                   }},
+        Option{
+            .long_name = "log2-stride",
+            .description =
+                "Log base 2 of the stride used for accessing memory locations",
+            .value_kind = OptionValueKind::Required,
+            .parse_value =
+                [&](OptionsParser &parser, auto value) {
+                    parser.parse_int(
+                        value, config.log2_stride,
+                        {.max_value =
+                             Config::max_sum_log2_mem_loc_count_and_stride -
+                             config.log2_memory_location_count});
+                }},
+        Option{
+            .short_name = 'b',
+            .long_name = "bench",
+            .description = "List of benchmarks that should be run",
+            .value_kind = OptionValueKind::Required,
+            .parse_value =
+                [&](OptionsParser &, std::optional<std::string_view> value) {
+                    if (!enabled_benchmarks)
+                    {
+                        enabled_benchmarks.emplace();
+                    }
+                    enabled_benchmarks->emplace(value.value_or(""));
+                }},
     };
     OptionsParser parser(options, argv);
     auto args = parser.parse();
@@ -413,6 +470,45 @@ int main(int, char **argv)
     {
         parser.help_and_exit("unexpected argument");
     }
-    // TODO: invoke benchmarks
+    auto benchmarks = all_benchmarks(config);
+    if (enabled_benchmarks)
+    {
+        enabled_benchmarks->erase("");
+        enabled_benchmarks->erase("help");
+        enabled_benchmarks->erase("list");
+        if (enabled_benchmarks->empty())
+        {
+            std::cout << "Available Benchmarks:\n";
+            for (auto &benchmark : benchmarks)
+            {
+                std::cout << benchmark.name() << "\n";
+            }
+            std::cout << std::endl;
+            return 0;
+        }
+        std::unordered_set<std::string> unknown_benchmarks =
+            *enabled_benchmarks;
+        for (auto &benchmark : benchmarks)
+        {
+            unknown_benchmarks.erase(benchmark.name());
+        }
+        if (!unknown_benchmarks.empty())
+        {
+            parser.help_and_exit(
+                "unknown benchmark: ", *unknown_benchmarks.begin(),
+                "\nrun with `--bench=list` to see all supported benchmarks.");
+        }
+    }
+    auto thread_cache = BenchHarnessBase::get_thread_cache();
+    for (auto &benchmark : benchmarks)
+    {
+        if (enabled_benchmarks && !enabled_benchmarks->count(benchmark.name()))
+        {
+            continue;
+        }
+        std::cout << "Running: " << benchmark.name() << std::endl;
+        benchmark.run(config);
+    }
+    std::cout << std::endl;
     return 0;
 }
diff --git a/src/powerpc64le/powerpc64le_benchmarks.cpp b/src/powerpc64le/powerpc64le_benchmarks.cpp
new file mode 100644
index 0000000..da0a672
--- /dev/null
+++ b/src/powerpc64le/powerpc64le_benchmarks.cpp
@@ -0,0 +1,21 @@
+#include "powerpc64le_benchmarks.h"
+#include <endian.h>
+
+#if defined(__powerpc64__) && BYTE_ORDER == LITTLE_ENDIAN
+
+std::vector<Benchmark> powerpc64le_benchmarks(Config config)
+{
+    // TODO: add powerpc64le benchmarks; until then the list stays empty
+    static_cast<void>(config); // not needed while there is nothing to build
+    std::vector<Benchmark> benchmarks;
+    return benchmarks;
+}
+
+#else
+
+std::vector<Benchmark> powerpc64le_benchmarks(Config)
+{
+    return std::vector<Benchmark>(); // not a little-endian powerpc64 build
+}
+
+#endif
\ No newline at end of file
diff --git a/src/powerpc64le/powerpc64le_benchmarks.h b/src/powerpc64le/powerpc64le_benchmarks.h
new file mode 100644
index 0000000..808685d
--- /dev/null
+++ b/src/powerpc64le/powerpc64le_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "../harness.h"
+
+std::vector<Benchmark> powerpc64le_benchmarks(Config config);
\ No newline at end of file
diff --git a/src/x86_64/x86_64_benchmarks.cpp b/src/x86_64/x86_64_benchmarks.cpp
new file mode 100644
index 0000000..9bc145f
--- /dev/null
+++ b/src/x86_64/x86_64_benchmarks.cpp
@@ -0,0 +1,20 @@
+#include "x86_64_benchmarks.h"
+
+#ifdef __x86_64__
+
+std::vector<Benchmark> x86_64_benchmarks(Config config)
+{
+    // TODO: add x86_64 benchmarks; until then the list stays empty
+    static_cast<void>(config); // not needed while there is nothing to build
+    std::vector<Benchmark> benchmarks;
+    return benchmarks;
+}
+
+#else
+
+std::vector<Benchmark> x86_64_benchmarks(Config)
+{
+    return std::vector<Benchmark>(); // not an x86_64 build: nothing to offer
+}
+
+#endif
\ No newline at end of file
diff --git a/src/x86_64/x86_64_benchmarks.h b/src/x86_64/x86_64_benchmarks.h
new file mode 100644
index 0000000..ed228af
--- /dev/null
+++ b/src/x86_64/x86_64_benchmarks.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "../harness.h"
+
+std::vector<Benchmark> x86_64_benchmarks(Config config);
\ No newline at end of file
-- 
2.30.2