Merge remote-tracking branch 'aosp/upstream-master' into upstream_merge am: ac64db9550 am: e2b9dee957 am: 3323a85476
am: b4950db3d7

Change-Id: Ida2ed2e1136f75f7f98b050169b58a5f21f5979e
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..4b3f13f
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+...
+
diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
new file mode 100644
index 0000000..1b6b585
--- /dev/null
+++ b/.travis-libcxx-setup.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Install a newer CMake version
+curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
+chmod +x install-cmake.sh
+sudo ./install-cmake.sh --prefix=/usr/local --skip-license
+
+# Checkout LLVM sources
+git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
+git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
+git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
+
+# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
+      -DLIBCXX_ABI_UNSTABLE=ON \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+      ../llvm-source
+make cxx -j2
+sudo make install-cxxabi install-cxx
+cd ../
diff --git a/.travis-setup.sh b/.travis-setup.sh
deleted file mode 100644
index c900fa9..0000000
--- a/.travis-setup.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-# Before install
-
-sudo add-apt-repository -y ppa:kalakris/cmake
-if [ "$STD" = "c++11" ]; then
-    sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-    if [ "$CXX" = "clang++" ]; then
-        wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
-        sudo add-apt-repository -y "deb http://llvm.org/apt/precise/ llvm-toolchain-precise-3.6 main"
-    fi
-fi
-sudo apt-get update -qq
-
-# Install
-sudo apt-get install -qq cmake
-if [ "$STD" = "c++11" ] && [ "$CXX" = "g++" ]; then
-    sudo apt-get install -qq gcc-4.8 g++-4.8
-    sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90
-    sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
-elif [ "$CXX" = "clang++" ]; then
-    sudo apt-get install -qq clang-3.6
-    sudo update-alternatives --install /usr/local/bin/clang   clang   /usr/bin/clang-3.6 90
-    sudo update-alternatives --install /usr/local/bin/clang++ clang++ /usr/bin/clang++-3.6 90
-    export PATH=/usr/local/bin:$PATH
-fi
diff --git a/.travis.yml b/.travis.yml
index bf26395..19c68dd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,26 +1,85 @@
+sudo: required
+dist: trusty
 language: cpp
 
+env:
+  global:
+    - PATH=/usr/local/bin:$PATH
+
 # NOTE: The COMPILER variable is unused. It simply makes the display on
 # travis-ci.org more readable.
 matrix:
     include:
         - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Coverage
+          addons:
+            apt:
+              packages:
+                - lcov
+          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Coverage
         - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Debug
+          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Debug
         - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Release
+          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Release
         - compiler: gcc
-          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Debug
-        - compiler: gcc
-          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Release
+          addons:
+            apt:
+              sources:
+                - ubuntu-toolchain-r-test
+              packages:
+                - g++-6
+          env:
+            - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
+            - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
         - compiler: clang
-          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Debug
+          env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
         - compiler: clang
-          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Release
+          env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
+        # Clang w/ libc++
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                clang-3.8
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+            - LIBCXX_BUILD=1
+            - EXTRA_FLAGS="-stdlib=libc++"
+        # Clang w/ libc++, ASAN, UBSAN
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                clang-3.8
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+            - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
+            - EXTRA_FLAGS="-stdlib=libc++ -fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fno-sanitize-recover=all"
+            - UBSAN_OPTIONS=print_stacktrace=1
+        # Clang w/ libc++ and MSAN
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                clang-3.8
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+            - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
+            - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+        # Clang w/ libc++ and TSAN
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                clang-3.8
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
+            - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
+            - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
 
 before_script:
-    - source .travis-setup.sh
+    - if [ -n "${LIBCXX_BUILD}" ]; then
+        source .travis-libcxx-setup.sh;
+      fi
     - mkdir build && cd build
 
 install:
@@ -31,7 +90,7 @@
     fi
 
 script:
-    - cmake .. -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="-std=${STD}"
+    - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" ..
     - make
     - make CTEST_OUTPUT_ON_FAILURE=1 test
 
diff --git a/AUTHORS b/AUTHORS
index 0f93e01..5a545fa 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -13,6 +13,7 @@
 Christopher Seymour <chris.j.seymour@hotmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Dominic Hamon <dma@stripysock.com>
+Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
@@ -23,6 +24,7 @@
 Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
diff --git a/Android.bp b/Android.bp
index 14a598b..ec822e5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -30,19 +30,18 @@
 
     srcs: [
         "src/benchmark.cc",
+        "src/benchmark_register.cc",
         "src/colorprint.cc",
         "src/commandlineflags.cc",
         "src/complexity.cc",
         "src/console_reporter.cc",
         "src/csv_reporter.cc",
         "src/json_reporter.cc",
-        "src/log.cc",
         "src/reporter.cc",
-        "src/re_posix.cc",
         "src/sleep.cc",
         "src/string_util.cc",
         "src/sysinfo.cc",
-        "src/walltime.cc",
+        "src/timers.cc",
     ],
     export_include_dirs: ["include"],
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a1251e7..8bfd21b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,7 @@
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
+option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
 # Make sure we can import out CMake functions
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
@@ -78,17 +79,26 @@
   add_cxx_compiler_flag(-pedantic-errors)
   add_cxx_compiler_flag(-Wshorten-64-to-32)
   add_cxx_compiler_flag(-Wfloat-equal)
-  add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
   add_cxx_compiler_flag(-fstrict-aliasing)
+  if (NOT BENCHMARK_USE_LIBCXX)
+    add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
+  endif()
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
     add_cxx_compiler_flag(-Wstrict-aliasing)
   endif()
   add_cxx_compiler_flag(-Wthread-safety)
-  if (HAVE_WTHREAD_SAFETY)
-    add_definitions(-DHAVE_WTHREAD_SAFETY)
+  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
   endif()
 
+  # On most UNIX-like platforms g++ and clang++ define _GNU_SOURCE as a
+  # predefined macro, which turns on all of the wonderful libc extensions.
+  # However, g++ doesn't do this on Cygwin, so we have to define it ourselves
+  # since we depend on GNU/POSIX/BSD extensions.
+  if (CYGWIN)
+    add_definitions(-D_GNU_SOURCE=1)
+  endif()
+
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     add_cxx_compiler_flag(-flto)
@@ -126,12 +136,34 @@
   add_cxx_compiler_flag(--coverage COVERAGE)
 endif()
 
+if (BENCHMARK_USE_LIBCXX)
+  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    add_cxx_compiler_flag(-stdlib=libc++)
+  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+    add_cxx_compiler_flag(-nostdinc++)
+    message("libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
+    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
+    # configuration checks such as 'find_package(Threads)'
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs)
+    # -lc++ cannot be added directly to CMAKE_<TYPE>_LINKER_FLAGS because
+    # linker flags appear before all linker inputs and -lc++ must appear after.
+    list(APPEND BENCHMARK_CXX_LIBRARIES c++)
+  else()
+    message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for this compiler")
+  endif()
+endif(BENCHMARK_USE_LIBCXX)
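+# A typical configuration using this option might look like the following
+# (an illustrative invocation; the compiler and source path will vary):
+#   cmake -DBENCHMARK_USE_LIBCXX=ON -DCMAKE_CXX_COMPILER=clang++ <path-to-source>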
+
 # C++ feature checks
+# Determine the correct regular expression engine to use
 cxx_feature_check(STD_REGEX)
 cxx_feature_check(GNU_POSIX_REGEX)
 cxx_feature_check(POSIX_REGEX)
-cxx_feature_check(STEADY_CLOCK)
+if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+endif()
 
+cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
 find_package(Threads REQUIRED)
 
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 4bff126..33cd941 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -40,6 +40,7 @@
 Kai Wolf <kai.wolf@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
diff --git a/README.md b/README.md
index a0bcc61..9109430 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@
 
 IRC channel: https://freenode.net #googlebenchmark
 
+[Known issues and common problems](#known-issues)
+
 ## Example usage
 ### Basic usage
 Define a function that executes the code to be measured.
@@ -40,13 +42,13 @@
 
 ```c++
 static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range_x()];
-  char* dst = new char[state.range_x()];
-  memset(src, 'x', state.range_x());
+  char* src = new char[state.range(0)];
+  char* dst = new char[state.range(0)];
+  memset(src, 'x', state.range(0));
   while (state.KeepRunning())
-    memcpy(dst, src, state.range_x());
+    memcpy(dst, src, state.range(0));
   state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range_x()));
+                          int64_t(state.range(0)));
   delete[] src;
   delete[] dst;
 }
@@ -70,7 +72,7 @@
 ```
 Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
 
-You might have a benchmark that depends on two inputs. For example, the
+You might have a benchmark that depends on two or more inputs. For example, the
 following code defines a family of benchmarks for measuring the speed of set
 insertion.
 
@@ -78,21 +80,21 @@
 static void BM_SetInsert(benchmark::State& state) {
   while (state.KeepRunning()) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range_x());
+    std::set<int> data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
+    for (int j = 0; j < state.range(1); ++j)
       data.insert(RandomNumber());
   }
 }
 BENCHMARK(BM_SetInsert)
-    ->ArgPair(1<<10, 1)
-    ->ArgPair(1<<10, 8)
-    ->ArgPair(1<<10, 64)
-    ->ArgPair(1<<10, 512)
-    ->ArgPair(8<<10, 1)
-    ->ArgPair(8<<10, 8)
-    ->ArgPair(8<<10, 64)
-    ->ArgPair(8<<10, 512);
+    ->Args({1<<10, 1})
+    ->Args({1<<10, 8})
+    ->Args({1<<10, 64})
+    ->Args({1<<10, 512})
+    ->Args({8<<10, 1})
+    ->Args({8<<10, 8})
+    ->Args({8<<10, 64})
+    ->Args({8<<10, 512});
 ```
 
 The preceding code is quite repetitive, and can be replaced with the following
@@ -101,7 +103,7 @@
 pair.
 
 ```c++
-BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
 ```
 
 For more complex patterns of inputs, passing a custom function to `Apply` allows
@@ -113,7 +115,7 @@
 static void CustomArguments(benchmark::internal::Benchmark* b) {
   for (int i = 0; i <= 10; ++i)
     for (int j = 32; j <= 1024*1024; j *= 8)
-      b->ArgPair(i, j);
+      b->Args({i, j});
 }
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
@@ -125,12 +127,12 @@
 
 ```c++
 static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range_x(), '-');
-  std::string s2(state.range_x(), '-');
+  std::string s1(state.range(0), '-');
+  std::string s2(state.range(0), '-');
   while (state.KeepRunning()) {
     benchmark::DoNotOptimize(s1.compare(s2));
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
 BENCHMARK(BM_StringCompare)
     ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
@@ -162,14 +164,14 @@
   Q q;
   typename Q::value_type v;
   while (state.KeepRunning()) {
-    for (int i = state.range_x(); i--; )
+    for (int i = state.range(0); i--; )
       q.push(v);
-    for (int e = state.range_x(); e--; )
+    for (int e = state.range(0); e--; )
       q.Wait(&v);
   }
   // actually messages, not bytes:
   state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range_x());
+      static_cast<int64_t>(state.iterations())*state.range(0));
 }
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 ```
@@ -206,6 +208,34 @@
 Note that elements of `...args` may refer to global variables. Users should
 avoid modifying global state inside of a benchmark.
 
+## Using RegisterBenchmark(name, fn, args...)
+
+The `RegisterBenchmark(name, func, args...)` function provides an alternative
+way to create and register benchmarks. It creates, registers, and returns a
+pointer to a new benchmark with the specified `name` that invokes
+`func(st, args...)` where `st` is a `benchmark::State` object.
+
+Unlike the `BENCHMARK` registration macros, which can only be used at the global
+scope, `RegisterBenchmark` can be called anywhere. This allows
+benchmark tests to be registered programmatically.
+
+Additionally `RegisterBenchmark` allows any callable object to be registered
+as a benchmark, including capturing lambdas and function objects. This allows
+benchmarks to be created from inputs that are only known at run time.
+
+For example:
+```c++
+auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+
+int main(int argc, char** argv) {
+  for (auto& test_input : { /* ... */ })
+      benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
+```
+
 ### Multithreaded benchmarks
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
 it is guaranteed that none of the threads will start until all have called
@@ -256,7 +286,7 @@
 
 ```c++
 static void BM_ManualTiming(benchmark::State& state) {
-  int microseconds = state.range_x();
+  int microseconds = state.range(0);
   std::chrono::duration<double, std::micro> sleep_duration {
     static_cast<double>(microseconds)
   };
@@ -363,6 +393,13 @@
 `Repetitions` on the registered benchmark object. When a benchmark is run
 more than once the mean and standard deviation of the runs will be reported.
 
+Additionally the `--benchmark_report_aggregates_only={true|false}` flag or
+`ReportAggregatesOnly(bool)` function can be used to change how repeated tests
+are reported. By default the result of each repeated run is reported. When this
+option is 'true' only the mean and standard deviation of the runs are reported.
+Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides
+the value of the flag for that benchmark.
+
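+A minimal sketch, reusing the `BM_memcpy` benchmark defined earlier:
+
+```c++
+// Run 4 repetitions and report only the aggregated statistics.
+BENCHMARK(BM_memcpy)->Arg(8)->Repetitions(4)->ReportAggregatesOnly(true);
+```
+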
 ## Fixtures
 Fixture tests are created by
 first defining a type that derives from ::benchmark::Fixture and then
@@ -425,12 +462,30 @@
 }
 ```
 
+## Running a subset of the benchmarks
+
+The `--benchmark_filter=<regex>` option can be used to only run the benchmarks
+which match the specified `<regex>`. For example:
+
+```bash
+$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
+Run on (1 X 2300 MHz CPU )
+2016-06-25 19:34:24
+Benchmark              Time           CPU Iterations
+----------------------------------------------------
+BM_memcpy/32          11 ns         11 ns   79545455
+BM_memcpy/32k       2181 ns       2185 ns     324074
+BM_memcpy/32          12 ns         12 ns   54687500
+BM_memcpy/32k       1834 ns       1837 ns     357143
+```
+
+
 ## Output Formats
 The library supports multiple output formats. Use the
-`--benchmark_format=<tabular|json|csv>` flag to set the format type. `tabular` is
-the default format.
+`--benchmark_format=<console|json|csv>` flag to set the format type. `console`
+is the default format.
 
-The Tabular format is intended to be a human readable format. By default
+The Console format is intended to be a human-readable format. By default
 the format generates color output. Context is output on stderr and the 
 tabular data on stdout. Example tabular output looks like:
 ```
@@ -493,6 +548,12 @@
 "BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
 ```
 
+## Output Files
+The library supports writing the output of the benchmark to a file specified
+by `--benchmark_out=<filename>`. The format of the output can be specified
+using `--benchmark_out_format={json|console|csv}`. Specifying
+`--benchmark_out` does not suppress the console output.
+
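+For example, reusing the hypothetical `run_benchmarks.x` binary from the
+filtering example above:
+
+```bash
+$ ./run_benchmarks.x --benchmark_out=results.json --benchmark_out_format=json
+```
+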
 ## Debug vs Release
 By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
 
@@ -507,4 +568,30 @@
 ```
 
 ## Linking against the library
-When using gcc, it is necessary to link against pthread to avoid runtime exceptions. This is due to how gcc implements std::thread. See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
+When using gcc, it is necessary to link against pthread to avoid runtime exceptions.
+This is due to how gcc implements std::thread.
+See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
+
+## Compiler Support
+
+Google Benchmark uses C++11 when building the library. As such we require
+a modern C++ toolchain, both compiler and standard library.
+
+The following minimum versions are strongly recommended to build the library:
+
+* GCC 4.8
+* Clang 3.4
+* Visual Studio 2013
+
+Anything older *may* work.
+
+Note: Using the library and its headers in C++03 is supported. C++11 is only
+required to build the library.
+
+## Known Issues
+
+### Windows
+
+* Users must manually link `shlwapi.lib`. Failure to do so may result
+  in unresolved symbols; see the sketch below.
+
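+  One way to satisfy this with MSVC, sketched below, is a linker pragma placed
+  in any one source file; adding the library to the project's linker inputs
+  works equally well:
+
+  ```c++
+  #pragma comment(lib, "shlwapi.lib")  // ask the MSVC linker to pull in shlwapi
+  ```
+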
diff --git a/appveyor.yml b/appveyor.yml
index 13be7fa..204f30d 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,21 +1,50 @@
 version: '{build}'
 
 configuration:
-  - Static Debug
-  - Static Release
-#  - Shared Debug
-#  - Shared Release
-
-platform:
-  - x86
-  - x64
+  - Debug
+  - Release
 
 environment:
   matrix:
     - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013"
+
+    - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013 Win64"
+
     - compiler: msvc-14-seh
-    - compiler: gcc-4.9.2-posix
-#    - compiler: gcc-4.8.4-posix
+      generator: "Visual Studio 14 2015"
+
+    - compiler: msvc-14-seh
+      generator: "Visual Studio 14 2015 Win64"
+
+    - compiler: gcc-5.3.0-posix
+      generator: "MinGW Makefiles"
+      cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin'
+
+matrix:
+  fast_finish: true
+
+install:
+  # git bash conflicts with MinGW makefiles
+  - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%")
+  - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%")
+
+# TODO Remove this. This is a hack to work around bogus warning messages
+# See http://goo.gl/euguBI for more information.
+before_build:
+  - del "C:\Program Files (x86)\MSBuild\14.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
+  - del "C:\Program Files (x86)\MSBuild\12.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
+
+build_script:
+  - md _build -Force
+  - cd _build
+  - echo %configuration%
+  - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" ..
+  - cmake --build . --config %configuration%
+
+test_script:
+  - ctest -c %configuration% --timeout 300 --output-on-failure
 
 artifacts:
   - path: '_build/CMakeFiles/*.log'
@@ -23,105 +52,3 @@
   - path: '_build/Testing/**/*.xml'
     name: test_results
 
-install:
-  # derive some extra information
-  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "linkage=%%a")
-  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "variant=%%b")
-  - if "%linkage%"=="Shared" (set shared=YES) else (set shared=NO)
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_name=%%a")
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_version=%%b")
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_threading=%%c")
-  - if "%platform%"=="x64" (set arch=x86_64)
-  - if "%platform%"=="x86" (set arch=i686)
-  # download the specific version of MinGW
-  - if "%compiler_name%"=="gcc" (for /f %%a in ('python mingw.py --quiet --version "%compiler_version%" --arch "%arch%" --threading "%compiler_threading%" --location "C:\mingw-builds"') do @set "compiler_path=%%a")
-
-before_build:
-  # Set up mingw commands
-  - if "%compiler_name%"=="gcc" (set "generator=MinGW Makefiles")
-  - if "%compiler_name%"=="gcc" (set "build=mingw32-make -j4")
-  - if "%compiler_name%"=="gcc" (set "test=mingw32-make CTEST_OUTPUT_ON_FAILURE=1 test")
-  # msvc specific commands
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="12" if "%platform%"=="x86" (set "generator=Visual Studio 12 2013")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="12" if "%platform%"=="x64" (set "generator=Visual Studio 12 2013 Win64")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="14" if "%platform%"=="x86" (set "generator=Visual Studio 14 2015")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="14" if "%platform%"=="x64" (set "generator=Visual Studio 14 2015 Win64")
-  - if "%compiler_name%"=="msvc" (set "build=cmake --build . --config %variant%")
-  - if "%compiler_name%"=="msvc" (set "test=ctest -c Release -D CTEST_OUTPUT_ON_FAILURE:STRING=1")
-  # add the compiler path if needed
-  - if not "%compiler_path%"=="" (set "PATH=%PATH%;%compiler_path%")
-  # git bash conflicts with MinGW makefiles
-  - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%")
-
-build_script:
-- ps: |
-    md _build -Force
-    cd _build
-    & cmake -G "$env:generator" "-DCMAKE_BUILD_TYPE=$env:variant" "-DBUILD_SHARED_LIBS=$env:shared" ..
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
-    iex "& $env:build"
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
-
-test_script:
-- ps: |
-    iex "& $env:test"
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
-
-    function Add-CTest-Result($testResult)
-    {
-        $tests = ([xml](get-content $testResult)).Site.Testing
-        $testsCount = 0
-        $anyFailures = $FALSE
-
-        foreach ($test in $tests.test) {
-            $testsCount++
-            $testName = $test.Name
-            $testpath = $test.Path
-            $timeNode = $test.SelectSingleNode('Results/NamedMeasurement[@name="Execution Time"]/Value')
-            if ($test.status -eq "failure") {
-                $time = ([double]$timeNode.InnerText * 1000)
-                Add-AppveyorTest $testName -Outcome Failed -FileName $testpath -Duration $time -ErrorMessage $($test.results.measurement.value)
-                Add-AppveyorMessage `"$testName failed`" -Category Error
-                $anyFailures = $TRUE
-            }
-            elseif ($test.status -eq "skipped") {
-                Add-AppveyorTest $testName -Outcome Ignored -Filename $testpath
-            }
-            else {
-                $time = ([double]$timeNode.InnerText * 1000)
-                Add-AppveyorTest $testName -Outcome Passed -FileName $testpath -Duration $time -StdOut $($test.results.measurement.value)
-            }
-        }
-        return $testsCount, $anyFailures
-    }
-
-    $testsCount = 0
-    $anyFailures = $FALSE
-
-    # Run tests and upload results to AppVeyor one by one
-    Get-ChildItem ".\Testing\*.xml" -Recurse | foreach {
-        $testfile = $_.fullname
-        $count, $testsResult = Add-CTest-Result $testfile
-        Write-Host "Found $testfile with $count tests"
-        $testsCount = $testsCount + $count
-        $anyFailures = $anyFailures -or $testsResult
-    }
-
-    Write-Host "There are $testsCount tests found"
-
-    if ($anyFailures -eq $TRUE){
-        Write-Host "Failing build as there are broken tests"
-        $host.SetShouldExit(1)
-    }
-
-matrix:
-  fast_finish: true
-
-cache:
-  - C:\mingw-builds
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 870f11a..9afde84 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -25,7 +25,7 @@
   string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${FLAG}")
-  check_cxx_compiler_flag("" ${SANITIZED_FLAG})
+  check_cxx_compiler_flag("${FLAG}" ${SANITIZED_FLAG})
   if(${SANITIZED_FLAG})
     set(VARIANT ${ARGV1})
     if(ARGV1)
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index 3059024..b106f32 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -26,7 +26,9 @@
   endif()
   message("-- Performing Test ${FEATURE}")
   try_run(RUN_${FEATURE} COMPILE_${FEATURE}
-          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp)
+          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+          CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+          LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
   if(RUN_${FEATURE} EQUAL 0)
     message("-- Performing Test ${FEATURE} -- success")
     set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 18aa9e6..b3b0a8e 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -14,8 +14,8 @@
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_
 
-#include "macros.h"
 #include "benchmark_api.h"
+#include "macros.h"
 #include "reporter.h"
 
-#endif // BENCHMARK_BENCHMARK_H_
+#endif  // BENCHMARK_BENCHMARK_H_
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index 664ca2a..28baa58 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -38,12 +38,12 @@
 // of memcpy() calls of different lengths:
 
 static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
-  memset(src, 'x', state.range_x());
+  char* src = new char[state.range(0)]; char* dst = new char[state.range(0)];
+  memset(src, 'x', state.range(0));
   while (state.KeepRunning())
-    memcpy(dst, src, state.range_x());
+    memcpy(dst, src, state.range(0));
   state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range_x()));
+                          int64_t(state.range(0)));
   delete[] src; delete[] dst;
 }
 BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
@@ -60,27 +60,27 @@
 static void BM_SetInsert(benchmark::State& state) {
   while (state.KeepRunning()) {
     state.PauseTiming();
-    set<int> data = ConstructRandomSet(state.range_x());
+    set<int> data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
+    for (int j = 0; j < state.range(1); ++j)
       data.insert(RandomNumber());
   }
 }
 BENCHMARK(BM_SetInsert)
-   ->ArgPair(1<<10, 1)
-   ->ArgPair(1<<10, 8)
-   ->ArgPair(1<<10, 64)
-   ->ArgPair(1<<10, 512)
-   ->ArgPair(8<<10, 1)
-   ->ArgPair(8<<10, 8)
-   ->ArgPair(8<<10, 64)
-   ->ArgPair(8<<10, 512);
+   ->Args({1<<10, 1})
+   ->Args({1<<10, 8})
+   ->Args({1<<10, 64})
+   ->Args({1<<10, 512})
+   ->Args({8<<10, 1})
+   ->Args({8<<10, 8})
+   ->Args({8<<10, 64})
+   ->Args({8<<10, 512});
 
 // The preceding code is quite repetitive, and can be replaced with
 // the following short-hand.  The following macro will pick a few
 // appropriate arguments in the product of the two specified ranges
 // and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
 
 // For more complex patterns of inputs, passing a custom function
 // to Apply allows programmatic specification of an
@@ -90,7 +90,7 @@
 static void CustomArguments(benchmark::internal::Benchmark* b) {
   for (int i = 0; i <= 10; ++i)
     for (int j = 32; j <= 1024*1024; j *= 8)
-      b->ArgPair(i, j);
+      b->Args({i, j});
 }
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 
@@ -101,14 +101,14 @@
   Q q;
   typename Q::value_type v;
   while (state.KeepRunning()) {
-    for (int i = state.range_x(); i--; )
+    for (int i = state.range(0); i--; )
       q.push(v);
-    for (int e = state.range_x(); e--; )
+    for (int e = state.range(0); e--; )
       q.Wait(&v);
   }
   // actually messages, not bytes:
   state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range_x());
+      static_cast<int64_t>(state.iterations())*state.range(0));
 }
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 
@@ -153,8 +153,16 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <string>
+#include <vector>
+
 #include "macros.h"
 
+#if defined(BENCHMARK_HAS_CXX11)
+#include <type_traits>
+#include <utility>
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
 
@@ -165,12 +173,17 @@
 // of each matching benchmark. Otherwise run each matching benchmark and
 // report the results.
 //
-// The second overload reports the results using the specified 'reporter'.
+// The second and third overloads use the specified 'console_reporter' and
+// 'file_reporter' respectively. 'file_reporter' will write to the file
+// specified by '--benchmark_out'. If '--benchmark_out' is not given the
+// 'file_reporter' is ignored.
 //
 // RETURNS: The number of matching benchmarks.
 size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
-
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
+                              BenchmarkReporter* file_reporter);
 
 // If this routine is called, peak memory allocation past this point in the
 // benchmark is reported at the end of the benchmark report line. (It is
@@ -184,8 +197,9 @@
 class BenchmarkImp;
 class BenchmarkFamilies;
 
-template <class T> struct Voider {
-    typedef void type;
+template <class T>
+struct Voider {
+  typedef void type;
 };
 
 template <class T, class = void>
@@ -193,7 +207,7 @@
 
 template <class T>
 struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
-    typedef int type;
+  typedef int type;
 };
 
 void UseCharPointer(char const volatile*);
@@ -202,8 +216,11 @@
 // registered benchmark.
 Benchmark* RegisterBenchmarkInternal(Benchmark*);
 
-} // end namespace internal
+// Ensure that the standard streams are properly initialized in every TU.
+int InitializeStreams();
+BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
 
+}  // end namespace internal
 
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
@@ -212,73 +229,64 @@
 #if defined(__GNUC__)
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    asm volatile("" : : "g"(value) : "memory");
+  asm volatile("" : : "g"(value) : "memory");
 }
 // Force the compiler to flush pending writes to global memory. Acts as an
 // effective read/write barrier
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-    asm volatile("" : : : "memory");
+  asm volatile("" : : : "memory");
 }
 #else
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
 // FIXME Add ClobberMemory() for non-gnu compilers
 #endif
 
 // TimeUnit is passed to a benchmark in order to specify the order of magnitude
 // for the measured time.
-enum TimeUnit {
-  kNanosecond,
-  kMicrosecond,
-  kMillisecond
-};
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
 
-// BigO is passed to a benchmark in order to specify the asymptotic computational 
-// complexity for the benchmark. In case oAuto is selected, complexity will be 
+// BigO is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark. In case oAuto is selected,
+// complexity will be
 // calculated automatically to the best fit.
-enum BigO {
-  oNone,
-  o1,
-  oN,
-  oNSquared,
-  oNCubed,
-  oLogN,
-  oNLogN,
-  oAuto,
-  oLambda
-};
+enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
-// BigOFunc is passed to a benchmark in order to specify the asymptotic 
+// BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
 typedef double(BigOFunc)(int);
 
+namespace internal {
+class ThreadTimer;
+class ThreadManager;
+
+#if defined(BENCHMARK_HAS_CXX11)
+enum ReportMode : unsigned {
+#else
+enum ReportMode {
+#endif
+  RM_Unspecified,  // The mode has not been manually specified
+  RM_Default,      // The mode is user-specified as default.
+  RM_ReportAggregatesOnly
+};
+}  // end namespace internal
+
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
 class State {
-public:
-  State(size_t max_iters, bool has_x, int x, bool has_y, int y,
-        int thread_i, int n_threads);
-
+ public:
   // Returns true if the benchmark should continue through another iteration.
   // NOTE: A benchmark may not return from the test until KeepRunning() has
   // returned false.
   bool KeepRunning() {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-      assert(!finished_);
-      started_ = true;
-      ResumeTiming();
+      StartKeepRunning();
     }
     bool const res = total_iterations_++ < max_iterations;
     if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-      assert(started_ && (!finished_ || error_occurred_));
-      if (!error_occurred_) {
-        PauseTiming();
-      }
-      // Total iterations now is one greater than max iterations. Fix this.
-      total_iterations_ = max_iterations;
-      finished_ = true;
+      FinishKeepRunning();
     }
     return res;
   }
@@ -288,10 +296,11 @@
   // Stop the benchmark timer.  If not called, the timer will be
   // automatically stopped after KeepRunning() returns false for the first time.
   //
-  // For threaded benchmarks the PauseTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will stop when the last thread has called this function.
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
+  // for the current thread.
+  //
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
   //
   // NOTE: PauseTiming()/ResumeTiming() are relatively
   // heavyweight, and so their use should generally be avoided
@@ -303,11 +312,6 @@
   // Start the benchmark timer.  The timer is NOT running on entrance to the
   // benchmark function. It begins running after the first call to KeepRunning()
   //
-  // For threaded benchmarks the ResumeTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will start when the last thread has called this function.
-  //
   // NOTE: PauseTiming()/ResumeTiming() are relatively
   // heavyweight, and so their use should generally be avoided
   // within each benchmark iteration, if possible.
@@ -319,10 +323,10 @@
   // thread and report an error with the specified 'msg'. After this call
   // the user may explicitly 'return' from the benchmark.
   //
-  // For threaded benchmarks only the current thread stops executing. If
-  // multiple threads report an error only the first error message is used.
-  // The current thread is no longer considered 'active' by
-  // 'PauseTiming()' and 'ResumingTiming()'.
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report an error only the
+  // first error message is used.
   //
   // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
   // the current scope immediately. If the function is called from within
@@ -335,10 +339,8 @@
   // is used instead of automatically measured time if UseManualTime() was
   // specified.
   //
-  // For threaded benchmarks the SetIterationTime() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all threads have made their ith call.
-  // The time will be set by the last thread to call this function.
+  // For threaded benchmarks the final value will be set to the largest
+  // reported values.
   void SetIterationTime(double seconds);
 
   // Set the number of bytes processed by the current benchmark
@@ -349,27 +351,21 @@
   //
   // REQUIRES: a benchmark has exited its KeepRunning loop.
   BENCHMARK_ALWAYS_INLINE
-  void SetBytesProcessed(size_t bytes) {
-    bytes_processed_ = bytes;
-  }
+  void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t bytes_processed() const {
-    return bytes_processed_;
-  }
+  size_t bytes_processed() const { return bytes_processed_; }
 
-  // If this routine is called with complexity_n > 0 and complexity report is requested for the 
-  // family benchmark, then current benchmark will be part of the computation and complexity_n will
+  // If this routine is called with complexity_n > 0 and a complexity report is
+  // requested for the family benchmark, then the current benchmark will be
+  // part of the computation and complexity_n will
   // represent the length of N.
   BENCHMARK_ALWAYS_INLINE
-  void SetComplexityN(int complexity_n) {
-    complexity_n_ = complexity_n;
-  }
+  void SetComplexityN(int complexity_n) { complexity_n_ = complexity_n; }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t complexity_length_n() {
-    return complexity_n_;
-  }
+  int complexity_length_n() { return complexity_n_; }
 
   // If this routine is called with items > 0, then an items/s
   // label is printed on the benchmark report line for the currently
@@ -378,14 +374,10 @@
   //
   // REQUIRES: a benchmark has exited its KeepRunning loop.
   BENCHMARK_ALWAYS_INLINE
-  void SetItemsProcessed(size_t items) {
-    items_processed_ = items;
-  }
+  void SetItemsProcessed(size_t items) { items_processed_ = items; }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t items_processed() const {
-    return items_processed_;
-  }
+  size_t items_processed() const { return items_processed_; }
 
   // If this routine is called, the specified label is printed at the
   // end of the benchmark report line for the currently executing
@@ -406,56 +398,58 @@
   // has the nested typename `basic_string`. This typename should be provided
   // as an injected class name in the case of std::string.
   template <class StringType>
-  void SetLabel(StringType const & str,
+  void SetLabel(StringType const& str,
                 typename internal::EnableIfString<StringType>::type = 1) {
     this->SetLabel(str.c_str());
   }
 
   // Range arguments for this run. CHECKs if the argument has been set.
   BENCHMARK_ALWAYS_INLINE
-  int range_x() const {
-    assert(has_range_x_);
-    ((void)has_range_x_); // Prevent unused warning.
-    return range_x_;
+  int range(std::size_t pos = 0) const {
+    assert(range_.size() > pos);
+    return range_[pos];
   }
 
-  BENCHMARK_ALWAYS_INLINE
-  int range_y() const {
-    assert(has_range_y_);
-    ((void)has_range_y_); // Prevent unused warning.
-    return range_y_;
-  }
+  BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead")
+  int range_x() const { return range(0); }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
+  int range_y() const { return range(1); }
 
   BENCHMARK_ALWAYS_INLINE
   size_t iterations() const { return total_iterations_; }
 
-private:
+ private:
   bool started_;
   bool finished_;
   size_t total_iterations_;
 
-  bool has_range_x_;
-  int range_x_;
-
-  bool has_range_y_;
-  int range_y_;
+  std::vector<int> range_;
 
   size_t bytes_processed_;
   size_t items_processed_;
 
   int complexity_n_;
 
-public:
-  // FIXME: Make this private somehow.
   bool error_occurred_;
-public:
+
+ public:
   // Index of the executing thread. Values from [0, threads).
   const int thread_index;
   // Number of threads concurrently executing the benchmark.
   const int threads;
   const size_t max_iterations;
 
-private:
+  // TODO make me private
+  State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+        int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+
+ private:
+  void StartKeepRunning();
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
 };
 
@@ -470,7 +464,7 @@
 // Each method returns "this" so that multiple method calls can
 // chained into one expression.
 class Benchmark {
-public:
+ public:
   virtual ~Benchmark();
 
   // Note: the following methods all return "this" so that multiple
@@ -489,29 +483,55 @@
   // REQUIRES: The function passed to the constructor must accept an arg1.
   Benchmark* Range(int start, int limit);
 
-  // Run this benchmark once for every value in the range [start..limit]
+  // Run this benchmark once for all values in the range [start..limit] with
+  // the specified step.
   // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* DenseRange(int start, int limit);
+  Benchmark* DenseRange(int start, int limit, int step = 1);
 
-  // Run this benchmark once with "x,y" as the extra arguments passed
+  // Run this benchmark once with "args" as the extra arguments passed
   // to the function.
-  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
-  Benchmark* ArgPair(int x, int y);
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Args(const std::vector<int>& args);
 
-  // Pick a set of values A from the range [lo1..hi1] and a set
-  // of values B from the range [lo2..hi2].  Run the benchmark for
-  // every pair of values in the cartesian product of A and B
-  // (i.e., for all combinations of the values in A and B).
-  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
-  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
+  // Equivalent to Args({x, y})
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Args'.
+  Benchmark* ArgPair(int x, int y) {
+    std::vector<int> args;
+    args.push_back(x);
+    args.push_back(y);
+    return Args(args);
+  }
+
+  // Run this benchmark once for a number of values picked from the
+  // ranges [start..limit].  (starts and limits are always picked.)
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Ranges(const std::vector<std::pair<int, int> >& ranges);
+
+  // Equivalent to ArgNames({name})
+  Benchmark* ArgName(const std::string& name);
+
+  // Set the argument names to display in the benchmark name. If not called,
+  // only argument values will be shown.
+  Benchmark* ArgNames(const std::vector<std::string>& names);
+
+  // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}).
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Ranges'.
+  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2) {
+    std::vector<std::pair<int, int> > ranges;
+    ranges.push_back(std::make_pair(lo1, hi1));
+    ranges.push_back(std::make_pair(lo2, hi2));
+    return Ranges(ranges);
+  }
 
   // Pass this benchmark object to *func, which can customize
-  // the benchmark by calling various methods like Arg, ArgPair,
+  // the benchmark by calling various methods like Arg, Args,
   // Threads, etc.
   Benchmark* Apply(void (*func)(Benchmark* benchmark));
 
-  // Set the range multiplier for non-dense range. If not called, the range multiplier 
-  // kRangeMultiplier will be used.
+  // Set the range multiplier for non-dense range. If not called, the range
+  // multiplier kRangeMultiplier will be used.
   Benchmark* RangeMultiplier(int multiplier);
 
   // Set the minimum amount of time to use when running this benchmark. This
@@ -524,6 +544,11 @@
   // REQUIRES: `n > 0`
   Benchmark* Repetitions(int n);
 
+  // Specify if each repetition of the benchmark should be reported separately
+  // or if only the final statistics should be reported. If the benchmark
+  // is not repeated then the single result is always reported.
+  Benchmark* ReportAggregatesOnly(bool v = true);
+
   // If a particular benchmark is I/O bound, runs multiple threads internally or
   // if for some reason CPU timings are not representative, call this method. If
   // called, the elapsed time will be used to control how many iterations are
@@ -531,15 +556,17 @@
   // called, the cpu time used by the benchmark will be used.
   Benchmark* UseRealTime();
 
-  // If a benchmark must measure time manually (e.g. if GPU execution time is being
-  // measured), call this method. If called, each benchmark iteration should call
+  // If a benchmark must measure time manually (e.g. if GPU execution time is
+  // being measured), call this method. If called, each benchmark iteration
+  // should call
   // SetIterationTime(seconds) to report the measured time, which will be used
   // to control how many iterations are run, and in the printing of items/second
   // or MB/second values.
   Benchmark* UseManualTime();
 
   // Set the asymptotic computational complexity for the benchmark. If called
-  // the asymptotic computational complexity will be shown on the output. 
+  // the asymptotic computational complexity will be shown on the output.
   Benchmark* Complexity(BigO complexity = benchmark::oAuto);
 
   // Set the asymptotic computational complexity for the benchmark. If called
@@ -567,6 +594,12 @@
   //    Foo in 16 threads
   Benchmark* ThreadRange(int min_threads, int max_threads);
 
+  // For each value n in the range, run this benchmark once using n threads.
+  // min_threads and max_threads are always included in the range.
+  // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts
+  // a benchmark with 1, 4, 7 and 8 threads.
+  Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1);
+
   // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
   Benchmark* ThreadPerCpu();
 
@@ -575,54 +608,138 @@
   // Used inside the benchmark implementation
   struct Instance;
 
-protected:
+ protected:
   explicit Benchmark(const char* name);
   Benchmark(Benchmark const&);
   void SetName(const char* name);
 
-private:
+  int ArgsCnt() const;
+
+  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
+
+ private:
   friend class BenchmarkFamilies;
-  BenchmarkImp* imp_;
+
+  std::string name_;
+  ReportMode report_mode_;
+  std::vector<std::string> arg_names_;   // Arg names for all benchmark runs
+  std::vector<std::vector<int> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
+  double min_time_;
+  int repetitions_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  std::vector<int> thread_counts_;
 
   Benchmark& operator=(Benchmark const&);
 };
 
+}  // namespace internal
+
+// Create and register a benchmark with the specified 'name' that invokes
+// the specified functor 'fn'.
+//
+// RETURNS: A pointer to the registered benchmark.
+internal::Benchmark* RegisterBenchmark(const char* name,
+                                       internal::Function* fn);
+
+#if defined(BENCHMARK_HAS_CXX11)
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+#endif
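+
+// For example, a minimal sketch (the benchmark body is illustrative):
+//
+//   benchmark::RegisterBenchmark("BM_empty", [](benchmark::State& st) {
+//     while (st.KeepRunning()) {
+//     }
+//   });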
+
+namespace internal {
 // The class used to hold all Benchmarks created from static function.
 // (ie those created using the BENCHMARK(...) macros.
 class FunctionBenchmark : public Benchmark {
-public:
-    FunctionBenchmark(const char* name, Function* func)
-        : Benchmark(name), func_(func)
-    {}
+ public:
+  FunctionBenchmark(const char* name, Function* func)
+      : Benchmark(name), func_(func) {}
 
-    virtual void Run(State& st);
-private:
-    Function* func_;
+  virtual void Run(State& st);
+
+ private:
+  Function* func_;
 };
 
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+class LambdaBenchmark : public Benchmark {
+ public:
+  virtual void Run(State& st) { lambda_(st); }
+
+ private:
+  template <class OLambda>
+  LambdaBenchmark(const char* name, OLambda&& lam)
+      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
+
+  LambdaBenchmark(LambdaBenchmark const&) = delete;
+
+ private:
+  template <class Lam>
+  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+
+  Lambda lambda_;
+};
+#endif
+
 }  // end namespace internal
 
+inline internal::Benchmark* RegisterBenchmark(const char* name,
+                                              internal::Function* fn) {
+  return internal::RegisterBenchmarkInternal(
+      ::new internal::FunctionBenchmark(name, fn));
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+  using BenchType =
+      internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
+  return internal::RegisterBenchmarkInternal(
+      ::new BenchType(name, std::forward<Lambda>(fn)));
+}
+#endif
+
+#if defined(BENCHMARK_HAS_CXX11) && \
+    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
+template <class Lambda, class... Args>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+                                       Args&&... args) {
+  return benchmark::RegisterBenchmark(
+      name, [=](benchmark::State& st) { fn(st, args...); });
+}
+#else
+#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+#endif
+
 // The base class for all fixture tests.
-class Fixture: public internal::Benchmark {
-public:
-    Fixture() : internal::Benchmark("") {}
+class Fixture : public internal::Benchmark {
+ public:
+  Fixture() : internal::Benchmark("") {}
 
-    virtual void Run(State& st) {
-      this->SetUp(st);
-      this->BenchmarkCase(st);
-      this->TearDown(st);
-    }
+  virtual void Run(State& st) {
+    this->SetUp(st);
+    this->BenchmarkCase(st);
+    this->TearDown(st);
+  }
 
-    virtual void SetUp(const State&) {}
-    virtual void TearDown(const State&) {}
+  // These will be deprecated ...
+  virtual void SetUp(const State&) {}
+  virtual void TearDown(const State&) {}
+  // ... In favor of these.
+  virtual void SetUp(State& st) { SetUp(const_cast<const State&>(st)); }
+  virtual void TearDown(State& st) { TearDown(const_cast<const State&>(st)); }
 
-protected:
-    virtual void BenchmarkCase(State&) = 0;
+ protected:
+  virtual void BenchmarkCase(State&) = 0;
 };
 
 }  // end namespace benchmark
 
-
 // ------------------------------------------------------
 // Macro to register benchmarks
 
@@ -637,26 +754,26 @@
 
 // Helpers for generating unique variable names
 #define BENCHMARK_PRIVATE_NAME(n) \
-    BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
 
-#define BENCHMARK_PRIVATE_DECLARE(n)       \
-  static ::benchmark::internal::Benchmark* \
-  BENCHMARK_PRIVATE_NAME(n) BENCHMARK_UNUSED
+#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
+  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
+      BENCHMARK_UNUSED
 
-#define BENCHMARK(n) \
-    BENCHMARK_PRIVATE_DECLARE(n) =                               \
-        (::benchmark::internal::RegisterBenchmarkInternal(       \
-            new ::benchmark::internal::FunctionBenchmark(#n, n)))
+#define BENCHMARK(n)                                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n, n)))
 
 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
-#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)})
 #define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
 #define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
 #define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
-  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
+  BENCHMARK(n)->Ranges({{(l1), (h1)}, {(l2), (h2)}})
 
 #if __cplusplus >= 201103L
 
@@ -671,14 +788,14 @@
 //}
 // /* Registers a benchmark named "BM_takes_args/int_string_test` */
 // BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
-#define BENCHMARK_CAPTURE(func, test_case_name, ...)                       \
-    BENCHMARK_PRIVATE_DECLARE(func) =                                      \
-        (::benchmark::internal::RegisterBenchmarkInternal(                 \
-            new ::benchmark::internal::FunctionBenchmark(                  \
-                    #func "/" #test_case_name,                             \
-                    [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
+  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #func "/" #test_case_name,                 \
+              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
 
-#endif // __cplusplus >= 11
+#endif  // __cplusplus >= 201103L
 
 // This will register a benchmark for a templatized function.  For example:
 //
@@ -688,54 +805,54 @@
 // BENCHMARK_TEMPLATE(BM_Foo, 1);
 //
 // will register BM_Foo<1> as a benchmark.
-#define BENCHMARK_TEMPLATE1(n, a) \
-  BENCHMARK_PRIVATE_DECLARE(n) =  \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
-
-#define BENCHMARK_TEMPLATE2(n, a, b)                     \
+#define BENCHMARK_TEMPLATE1(n, a)                        \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark(    \
-            #n "<" #a "," #b ">", n<a, b>)))
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
+  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
+      (::benchmark::internal::RegisterBenchmarkInternal(                     \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
+                                                       n<a, b>)))
 
 #if __cplusplus >= 201103L
-#define BENCHMARK_TEMPLATE(n, ...)           \
-  BENCHMARK_PRIVATE_DECLARE(n) =             \
+#define BENCHMARK_TEMPLATE(n, ...)                       \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark( \
-        #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
 #else
 #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
 #endif
 
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    virtual void BenchmarkCase(::benchmark::State&);          \
+  };
 
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)      \
-class BaseClass##_##Method##_Benchmark : public BaseClass { \
-public:                                                     \
-    BaseClass##_##Method##_Benchmark() : BaseClass() {      \
-        this->SetName(#BaseClass "/" #Method);}             \
-protected:                                                  \
-    virtual void BenchmarkCase(::benchmark::State&);        \
-};
-
-#define BENCHMARK_DEFINE_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-    void BaseClass##_##Method##_Benchmark::BenchmarkCase
+#define BENCHMARK_DEFINE_F(BaseClass, Method)    \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
 
 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
 
 #define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-    BENCHMARK_PRIVATE_DECLARE(TestName) = \
-        (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
+      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
 
 // This macro will define and register a benchmark within a fixture class.
-#define BENCHMARK_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-    BENCHMARK_REGISTER_F(BaseClass, Method); \
-    void BaseClass##_##Method##_Benchmark::BenchmarkCase
-
+#define BENCHMARK_F(BaseClass, Method)           \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);       \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
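+
+// For example (illustrative; MyFixture and FooTest are placeholder names):
+//
+//   class MyFixture : public ::benchmark::Fixture {
+//    public:
+//     void SetUp(::benchmark::State& st) { /* per-run setup */ }
+//     void TearDown(::benchmark::State& st) { /* per-run teardown */ }
+//   };
+//
+//   BENCHMARK_F(MyFixture, FooTest)(::benchmark::State& st) {
+//     while (st.KeepRunning()) {
+//       // code under measurement
+//     }
+//   }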
 
 // Helper macro to create a main routine in a test that runs the benchmarks
 #define BENCHMARK_MAIN()                   \
diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h
index 09d13c1..2466fd3 100644
--- a/include/benchmark/macros.h
+++ b/include/benchmark/macros.h
@@ -14,43 +14,53 @@
 #ifndef BENCHMARK_MACROS_H_
 #define BENCHMARK_MACROS_H_
 
-#if __cplusplus < 201103L
-# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
-    TypeName(const TypeName&);                         \
-    TypeName& operator=(const TypeName&)
+#if __cplusplus >= 201103L
+#define BENCHMARK_HAS_CXX11
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                         \
+  TypeName& operator=(const TypeName&)
 #else
-# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
-    TypeName(const TypeName&) = delete;                \
-    TypeName& operator=(const TypeName&) = delete
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;                \
+  TypeName& operator=(const TypeName&) = delete
 #endif
 
 #if defined(__GNUC__)
-# define BENCHMARK_UNUSED __attribute__((unused))
-# define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-# define BENCHMARK_NOEXCEPT noexcept
-# define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#define BENCHMARK_UNUSED __attribute__((unused))
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #elif defined(_MSC_VER) && !defined(__clang__)
-# define BENCHMARK_UNUSED
-# define BENCHMARK_ALWAYS_INLINE __forceinline
-# if _MSC_VER >= 1900
-#  define BENCHMARK_NOEXCEPT noexcept
-#  define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-# else
-#  define BENCHMARK_NOEXCEPT
-#  define BENCHMARK_NOEXCEPT_OP(x)
-# endif
-# define __func__ __FUNCTION__
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE __forceinline
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #else
-# define BENCHMARK_UNUSED
-# define BENCHMARK_ALWAYS_INLINE
-# define BENCHMARK_NOEXCEPT
-# define BENCHMARK_NOEXCEPT_OP(x)
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
 #endif
 
 #if defined(__GNUC__)
-# define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
 #else
-# define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_DEPRECATED_MSG(msg)
+#endif
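+
+// For example (illustrative; the attribute attaches to a declaration):
+//   void OldRoutine() BENCHMARK_DEPRECATED_MSG("use NewRoutine instead");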
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #endif
 
 #endif  // BENCHMARK_MACROS_H_
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
index 22c97a0..8c39e7f 100644
--- a/include/benchmark/reporter.h
+++ b/include/benchmark/reporter.h
@@ -41,19 +41,20 @@
   };
 
   struct Run {
-    Run() :
-      error_occurred(false),
-      iterations(1),
-      time_unit(kNanosecond),
-      real_accumulated_time(0),
-      cpu_accumulated_time(0),
-      bytes_per_second(0),
-      items_per_second(0),
-      max_heapbytes_used(0),
-      complexity(oNone),
-      complexity_n(0),
-      report_big_o(false),
-      report_rms(false) {}
+    Run()
+        : error_occurred(false),
+          iterations(1),
+          time_unit(kNanosecond),
+          real_accumulated_time(0),
+          cpu_accumulated_time(0),
+          bytes_per_second(0),
+          items_per_second(0),
+          max_heapbytes_used(0),
+          complexity(oNone),
+          complexity_lambda(),
+          complexity_n(0),
+          report_big_o(false),
+          report_rms(false) {}
 
     std::string benchmark_name;
     std::string report_label;  // Empty if not set by benchmark.
@@ -133,13 +134,9 @@
     error_stream_ = err;
   }
 
-  std::ostream& GetOutputStream() const {
-    return *output_stream_;
-  }
+  std::ostream& GetOutputStream() const { return *output_stream_; }
 
-  std::ostream& GetErrorStream() const {
-    return *error_stream_;
-  }
+  std::ostream& GetErrorStream() const { return *error_stream_; }
 
   virtual ~BenchmarkReporter();
 
@@ -157,13 +154,19 @@
 // default reporter used by RunSpecifiedBenchmarks().
 class ConsoleReporter : public BenchmarkReporter {
  public:
+  enum OutputOptions { OO_None, OO_Color };
+  explicit ConsoleReporter(OutputOptions color_output = OO_Color)
+      : name_field_width_(0), color_output_(color_output == OO_Color) {}
+
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
 
  protected:
   virtual void PrintRunData(const Run& report);
-
   size_t name_field_width_;
+
+ private:
+  bool color_output_;
 };
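+
+// For example (illustrative), a reporter that never emits color codes:
+//   ConsoleReporter plain_reporter(ConsoleReporter::OO_None);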
 
 class JSONReporter : public BenchmarkReporter {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6dab64b..4038875 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,25 +1,18 @@
 # Allow the source files to find headers in src/
 include_directories(${PROJECT_SOURCE_DIR}/src)
 
-# Define the source files
-set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
-                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
-                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
-                 "sysinfo.cc" "walltime.cc" "complexity.cc")
-# Determine the correct regular expression engine to use
-if(HAVE_STD_REGEX)
-  set(RE_FILES "re_std.cc")
-elseif(HAVE_GNU_POSIX_REGEX)
-  set(RE_FILES "re_posix.cc")
-elseif(HAVE_POSIX_REGEX)
-  set(RE_FILES "re_posix.cc")
-else()
-  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
+  list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
 endif()
 
-add_library(benchmark ${SOURCE_FILES} ${RE_FILES})
+file(GLOB
+  SOURCE_FILES
+    *.cc
+    ${PROJECT_SOURCE_DIR}/include/benchmark/*.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
 
-
+add_library(benchmark ${SOURCE_FILES})
 set_target_properties(benchmark PROPERTIES
   OUTPUT_NAME "benchmark"
   VERSION ${GENERIC_LIB_VERSION}
@@ -27,7 +20,7 @@
 )
 
 # Link threads.
-target_link_libraries(benchmark ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 
 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
diff --git a/src/arraysize.h b/src/arraysize.h
index 638a52a..51a50f2 100644
--- a/src/arraysize.h
+++ b/src/arraysize.h
@@ -11,7 +11,6 @@
 // a pointer by mistake, you will get a compile-time error.
 //
 
-
 // This template function declaration is used in defining arraysize.
 // Note that the function doesn't need an implementation, as we only
 // use its type.
@@ -28,7 +27,7 @@
 
 #define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
 
-#endif // BENCHMARK_ARRAYSIZE_H_
+#endif  // BENCHMARK_ARRAYSIZE_H_
diff --git a/src/benchmark.cc b/src/benchmark.cc
index cb8e132..95f6a25 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -13,25 +13,28 @@
 // limitations under the License.
 
 #include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
 #include "internal_macros.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#include <sys/time.h>
 #include <sys/resource.h>
+#include <sys/time.h>
 #include <unistd.h>
 #endif
 
-#include <cstdlib>
-#include <cstring>
-#include <cstdio>
 #include <algorithm>
 #include <atomic>
 #include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
 #include <iostream>
 #include <memory>
 #include <thread>
 
 #include "check.h"
+#include "colorprint.h"
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "log.h"
@@ -40,7 +43,7 @@
 #include "stat.h"
 #include "string_util.h"
 #include "sysinfo.h"
-#include "walltime.h"
+#include "timers.h"
 
 DEFINE_bool(benchmark_list_tests, false,
             "Print a list of benchmarks. This option overrides all other "
@@ -65,816 +68,273 @@
              "The number of runs of each benchmark. If greater than 1, the "
              "mean and standard deviation of the runs will be reported.");
 
+DEFINE_bool(benchmark_report_aggregates_only, false,
+            "Report the result of each benchmark repetitions. When 'true' is "
+            "specified only the mean, standard deviation, and other statistics "
+            "are reported for repeated benchmarks.");
+
 DEFINE_string(benchmark_format, "console",
               "The format to use for console output. Valid values are "
               "'console', 'json', or 'csv'.");
 
-DEFINE_bool(color_print, true, "Enables colorized logging.");
+DEFINE_string(benchmark_out_format, "json",
+              "The format to use for file output. Valid values are "
+              "'console', 'json', or 'csv'.");
+
+DEFINE_string(benchmark_out, "", "The file to write additional output to");
+
+DEFINE_string(benchmark_color, "auto",
+              "Whether to use colors in the output.  Valid values: "
+              "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use "
+              "colors if the output is being sent to a terminal and the TERM "
+              "environment variable is set to a terminal type that supports "
+              "colors.");
 
 DEFINE_int32(v, 0, "The level of verbose logging to output");
 
-
 namespace benchmark {
-
 namespace internal {
 
 void UseCharPointer(char const volatile*) {}
 
-// NOTE: This is a dummy "mutex" type used to denote the actual mutex
-// returned by GetBenchmarkLock(). This is only used to placate the thread
-// safety warnings by giving the return of GetBenchmarkLock() a name.
-struct CAPABILITY("mutex") BenchmarkLockType {};
-BenchmarkLockType BenchmarkLockVar;
-
-} // end namespace internal
-
-inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
-GetBenchmarkLock()
-{
-  static Mutex lock;
-  return lock;
-}
+}  // end namespace internal
 
 namespace {
 
-bool IsZero(double n) {
-    return std::abs(n) < std::numeric_limits<double>::epsilon();
-}
-
-// For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
 static const size_t kMaxIterations = 1000000000;
 
-bool running_benchmark = false;
+}  // end namespace
 
-// Global variable so that a benchmark can cause a little extra printing
-std::string* GetReportLabel() {
-    static std::string label GUARDED_BY(GetBenchmarkLock());
-    return &label;
-}
+namespace internal {
 
-// Global variable so that a benchmark can report an error as a human readable
-// string. If error_message is null no error occurred.
-#if defined(_MSC_VER) && _MSC_VER <= 1800
-typedef char* error_message_type;
-#else
-typedef const char* error_message_type;
-#endif
+class ThreadManager {
+ public:
+  ThreadManager(int num_threads)
+      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
 
-static std::atomic<error_message_type> error_message = ATOMIC_VAR_INIT(nullptr);
+  Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
+    return benchmark_mutex_;
+  }
 
-// TODO(ericwf): support MallocCounter.
-//static benchmark::MallocCounter *benchmark_mc;
+  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
+    return start_stop_barrier_.wait();
+  }
 
-struct ThreadStats {
-    ThreadStats() : bytes_processed(0), items_processed(0), complexity_n(0) {}
-    int64_t bytes_processed;
-    int64_t items_processed;
-    int  complexity_n;
+  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
+    start_stop_barrier_.removeThread();
+    if (--alive_threads_ == 0) {
+      MutexLock lock(end_cond_mutex_);
+      end_condition_.notify_all();
+    }
+  }
+
+  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
+    MutexLock lock(end_cond_mutex_);
+    end_condition_.wait(lock.native_handle(),
+                        [this]() { return alive_threads_ == 0; });
+  }
+
+ public:
+  struct Result {
+    double real_time_used = 0;
+    double cpu_time_used = 0;
+    double manual_time_used = 0;
+    int64_t bytes_processed = 0;
+    int64_t items_processed = 0;
+    int complexity_n = 0;
+    std::string report_label_;
+    std::string error_message_;
+    bool has_error_ = false;
+  };
+  GUARDED_BY(GetBenchmarkMutex()) Result results;
+
+ private:
+  mutable Mutex benchmark_mutex_;
+  std::atomic<int> alive_threads_;
+  Barrier start_stop_barrier_;
+  Mutex end_cond_mutex_;
+  Condition end_condition_;
 };
 
 // Timer management class
-class TimerManager {
+class ThreadTimer {
  public:
-  TimerManager(int num_threads, Notification* done)
-      : num_threads_(num_threads),
-        running_threads_(num_threads),
-        done_(done),
-        running_(false),
-        real_time_used_(0),
-        cpu_time_used_(0),
-        manual_time_used_(0),
-        num_finalized_(0),
-        phase_number_(0),
-        entered_(0)
-  {
+  ThreadTimer() = default;
+
+  // Called by each thread
+  void StartTimer() {
+    running_ = true;
+    start_real_time_ = ChronoClockNow();
+    start_cpu_time_ = ThreadCPUUsage();
   }
 
   // Called by each thread
-  void StartTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(!running_) << "Called StartTimer when timer is already running";
-        running_ = true;
-        start_real_time_ = walltime::Now();
-        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
-       }
-     }
-     if (last_thread) {
-       phase_condition_.notify_all();
-     }
+  void StopTimer() {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += ChronoClockNow() - start_real_time_;
+    cpu_time_used_ += ThreadCPUUsage() - start_cpu_time_;
   }
 
   // Called by each thread
-  void StopTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(running_) << "Called StopTimer when timer is already stopped";
-        InternalStop();
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
+  void SetIterationTime(double seconds) { manual_time_used_ += seconds; }
 
-  // Called by each thread
-  void SetIterationTime(double seconds) EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        manual_time_used_ += seconds;
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
-
-  // Called by each thread
-  void Finalize() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    num_finalized_++;
-    if (num_finalized_ == num_threads_) {
-      CHECK(!running_) <<
-        "The timer should be stopped before the timer is finalized";
-      done_->Notify();
-    }
-  }
-
-  void RemoveErroredThread() EXCLUDES(lock_) {
-    MutexLock ml(lock_);
-    int last_thread = --running_threads_ == 0;
-    if (last_thread && running_)
-      InternalStop();
-    else if (!last_thread)
-      phase_condition_.notify_all();
-  }
+  bool running() const { return running_; }
 
   // REQUIRES: timer is not running
-  double real_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double real_time_used() {
     CHECK(!running_);
     return real_time_used_;
   }
 
   // REQUIRES: timer is not running
-  double cpu_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double cpu_time_used() {
     CHECK(!running_);
     return cpu_time_used_;
   }
 
   // REQUIRES: timer is not running
-  double manual_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double manual_time_used() {
     CHECK(!running_);
     return manual_time_used_;
   }
 
  private:
-  Mutex lock_;
-  Condition phase_condition_;
-  int num_threads_;
-  int running_threads_;
-  Notification* done_;
-
-  bool running_;                // Is the timer running
-  double start_real_time_;      // If running_
-  double start_cpu_time_;       // If running_
+  bool running_ = false;        // Is the timer running
+  double start_real_time_ = 0;  // If running_
+  double start_cpu_time_ = 0;   // If running_
 
   // Accumulated time so far (does not contain current slice if running_)
-  double real_time_used_;
-  double cpu_time_used_;
+  double real_time_used_ = 0;
+  double cpu_time_used_ = 0;
   // Manually set iteration time. User sets this with SetIterationTime(seconds).
-  double manual_time_used_;
-
-  // How many threads have called Finalize()
-  int num_finalized_;
-
-  // State for barrier management
-  int phase_number_;
-  int entered_;         // Number of threads that have entered this barrier
-
-  void InternalStop() REQUIRES(lock_) {
-    CHECK(running_);
-    running_ = false;
-    real_time_used_ += walltime::Now() - start_real_time_;
-    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
-                       - start_cpu_time_);
-  }
-
-  // Enter the barrier and wait until all other threads have also
-  // entered the barrier.  Returns iff this is the last thread to
-  // enter the barrier.
-  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
-    entered_++;
-    if (entered_ < running_threads_) {
-      // Wait for all threads to enter
-      int phase_number_cp = phase_number_;
-      auto cb = [this, phase_number_cp]() {
-        return this->phase_number_ > phase_number_cp ||
-               entered_ == running_threads_; // A thread has aborted in error
-      };
-      phase_condition_.wait(ml.native_handle(), cb);
-      if (phase_number_ > phase_number_cp)
-        return false;
-      // else (running_threads_ == entered_) and we are the last thread.
-    }
-    // Last thread has reached the barrier
-    phase_number_++;
-    entered_ = 0;
-    return true;
-  }
+  double manual_time_used_ = 0;
 };
 
-// TimerManager for current run.
-static std::unique_ptr<TimerManager> timer_manager = nullptr;
-
-} // end namespace
-
-namespace internal {
-
-// Information kept per benchmark we may want to run
-struct Benchmark::Instance {
-  std::string    name;
-  Benchmark*     benchmark;
-  bool           has_arg1;
-  int            arg1;
-  bool           has_arg2;
-  int            arg2;
-  TimeUnit       time_unit;
-  int            range_multiplier;
-  bool           use_real_time;
-  bool           use_manual_time;
-  BigO           complexity;
-  BigOFunc*      complexity_lambda;
-  bool           last_benchmark_instance;
-  int            repetitions;
-  double         min_time;
-  int            threads;    // Number of concurrent threads to use
-  bool           multithreaded;  // Is benchmark multi-threaded?
-};
-
-// Class for managing registered benchmarks.  Note that each registered
-// benchmark identifies a family of related benchmarks to run.
-class BenchmarkFamilies {
- public:
-  static BenchmarkFamilies* GetInstance();
-
-  // Registers a benchmark family and returns the index assigned to it.
-  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
-
-  // Extract the list of benchmark instances that match the specified
-  // regular expression.
-  bool FindBenchmarks(const std::string& re,
-                      std::vector<Benchmark::Instance>* benchmarks);
- private:
-  BenchmarkFamilies() {}
-
-  std::vector<std::unique_ptr<Benchmark>> families_;
-  Mutex mutex_;
-};
-
-
-class BenchmarkImp {
-public:
-  explicit BenchmarkImp(const char* name);
-  ~BenchmarkImp();
-
-  void Arg(int x);
-  void Unit(TimeUnit unit);
-  void Range(int start, int limit);
-  void DenseRange(int start, int limit);
-  void ArgPair(int start, int limit);
-  void RangePair(int lo1, int hi1, int lo2, int hi2);
-  void RangeMultiplier(int multiplier);
-  void MinTime(double n);
-  void Repetitions(int n);
-  void UseRealTime();
-  void UseManualTime();
-  void Complexity(BigO complexity);
-  void ComplexityLambda(BigOFunc* complexity);
-  void Threads(int t);
-  void ThreadRange(int min_threads, int max_threads);
-  void ThreadPerCpu();
-  void SetName(const char* name);
-
-  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
-
-private:
-  friend class BenchmarkFamilies;
-
-  std::string name_;
-  int arg_count_;
-  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
-  TimeUnit time_unit_;
-  int range_multiplier_;
-  double min_time_;
-  int repetitions_;
-  bool use_real_time_;
-  bool use_manual_time_;
-  BigO complexity_;
-  BigOFunc* complexity_lambda_;
-  std::vector<int> thread_counts_;
-
-  BenchmarkImp& operator=(BenchmarkImp const&);
-};
-
-BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
-  static BenchmarkFamilies instance;
-  return &instance;
-}
-
-
-size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
-  MutexLock l(mutex_);
-  size_t index = families_.size();
-  families_.push_back(std::move(family));
-  return index;
-}
-
-bool BenchmarkFamilies::FindBenchmarks(
-    const std::string& spec,
-    std::vector<Benchmark::Instance>* benchmarks) {
-  // Make regular expression out of command-line flag
-  std::string error_msg;
-  Regex re;
-  if (!re.Init(spec, &error_msg)) {
-    std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
-    return false;
-  }
-
-  // Special list of thread counts to use when none are specified
-  std::vector<int> one_thread;
-  one_thread.push_back(1);
-
-  MutexLock l(mutex_);
-  for (std::unique_ptr<Benchmark>& bench_family : families_) {
-    // Family was deleted or benchmark doesn't match
-    if (!bench_family) continue;
-    BenchmarkImp* family = bench_family->imp_;
-
-    if (family->arg_count_ == -1) {
-      family->arg_count_ = 0;
-      family->args_.emplace_back(-1, -1);
-    }
-    for (auto const& args : family->args_) {
-      const std::vector<int>* thread_counts =
-        (family->thread_counts_.empty()
-         ? &one_thread
-         : &family->thread_counts_);
-      for (int num_threads : *thread_counts) {
-
-        Benchmark::Instance instance;
-        instance.name = family->name_;
-        instance.benchmark = bench_family.get();
-        instance.has_arg1 = family->arg_count_ >= 1;
-        instance.arg1 = args.first;
-        instance.has_arg2 = family->arg_count_ == 2;
-        instance.arg2 = args.second;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.repetitions = family->repetitions_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.threads = num_threads;
-        instance.multithreaded = !(family->thread_counts_.empty());
-
-        // Add arguments to instance name
-        if (family->arg_count_ >= 1) {
-          AppendHumanReadable(instance.arg1, &instance.name);
-        }
-        if (family->arg_count_ >= 2) {
-          AppendHumanReadable(instance.arg2, &instance.name);
-        }
-        if (!IsZero(family->min_time_)) {
-          instance.name +=  StringPrintF("/min_time:%0.3f",  family->min_time_);
-        }
-        if (family->repetitions_ != 0) {
-          instance.name +=  StringPrintF("/repeats:%d",  family->repetitions_);
-        }
-        if (family->use_manual_time_) {
-          instance.name +=  "/manual_time";
-        } else if (family->use_real_time_) {
-          instance.name +=  "/real_time";
-        }
-
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name += StringPrintF("/threads:%d", instance.threads);
-        }
-
-        if (re.Match(instance.name)) {
-          instance.last_benchmark_instance = (args == family->args_.back());
-          benchmarks->push_back(instance);
-        }
-      }
-    }
-  }
-  return true;
-}
-
-BenchmarkImp::BenchmarkImp(const char* name)
-    : name_(name), arg_count_(-1), time_unit_(kNanosecond),
-      range_multiplier_(kRangeMultiplier), min_time_(0.0), repetitions_(0),
-      use_real_time_(false), use_manual_time_(false),
-      complexity_(oNone) {
-}
-
-BenchmarkImp::~BenchmarkImp() {
-}
-
-void BenchmarkImp::Arg(int x) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  args_.emplace_back(x, -1);
-}
-
-void BenchmarkImp::Unit(TimeUnit unit) {
-  time_unit_ = unit;
-}
-
-void BenchmarkImp::Range(int start, int limit) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  std::vector<int> arglist;
-  AddRange(&arglist, start, limit, range_multiplier_);
-
-  for (int i : arglist) {
-    args_.emplace_back(i, -1);
-  }
-}
-
-void BenchmarkImp::DenseRange(int start, int limit) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  CHECK_GE(start, 0);
-  CHECK_LE(start, limit);
-  for (int arg = start; arg <= limit; arg++) {
-    args_.emplace_back(arg, -1);
-  }
-}
-
-void BenchmarkImp::ArgPair(int x, int y) {
-  CHECK(arg_count_ == -1 || arg_count_ == 2);
-  arg_count_ = 2;
-  args_.emplace_back(x, y);
-}
-
-void BenchmarkImp::RangePair(int lo1, int hi1, int lo2, int hi2) {
-  CHECK(arg_count_ == -1 || arg_count_ == 2);
-  arg_count_ = 2;
-  std::vector<int> arglist1, arglist2;
-  AddRange(&arglist1, lo1, hi1, range_multiplier_);
-  AddRange(&arglist2, lo2, hi2, range_multiplier_);
-
-  for (int i : arglist1) {
-    for (int j : arglist2) {
-      args_.emplace_back(i, j);
-    }
-  }
-}
-
-void BenchmarkImp::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
-  range_multiplier_ = multiplier;
-}
-
-void BenchmarkImp::MinTime(double t) {
-  CHECK(t > 0.0);
-  min_time_ = t;
-}
-
-
-void BenchmarkImp::Repetitions(int n) {
-  CHECK(n > 0);
-  repetitions_ = n;
-}
-
-void BenchmarkImp::UseRealTime() {
-  CHECK(!use_manual_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
-  use_real_time_ = true;
-}
-
-void BenchmarkImp::UseManualTime() {
-  CHECK(!use_real_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
-  use_manual_time_ = true;
-}
-
-void BenchmarkImp::Complexity(BigO complexity){
-  complexity_ = complexity;
-}
-
-void BenchmarkImp::ComplexityLambda(BigOFunc* complexity) {
-  complexity_lambda_ = complexity;
-}
-
-void BenchmarkImp::Threads(int t) {
-  CHECK_GT(t, 0);
-  thread_counts_.push_back(t);
-}
-
-void BenchmarkImp::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-
-  AddRange(&thread_counts_, min_threads, max_threads, 2);
-}
-
-void BenchmarkImp::ThreadPerCpu() {
-  static int num_cpus = NumCPUs();
-  thread_counts_.push_back(num_cpus);
-}
-
-void BenchmarkImp::SetName(const char* name) {
-  name_ = name;
-}
-
-void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
-
-  // Add "lo"
-  dst->push_back(lo);
-
-  static const int kint32max = std::numeric_limits<int32_t>::max();
-
-  // Now space out the benchmarks in multiples of "mult"
-  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
-    if (i >= hi) break;
-    if (i > lo) {
-      dst->push_back(i);
-    }
-  }
-  // Add "hi" (if different from "lo")
-  if (hi != lo) {
-    dst->push_back(hi);
-  }
-}
-
-Benchmark::Benchmark(const char* name)
-    : imp_(new BenchmarkImp(name))
-{
-}
-
-Benchmark::~Benchmark()  {
-  delete imp_;
-}
-
-Benchmark::Benchmark(Benchmark const& other)
-  : imp_(new BenchmarkImp(*other.imp_))
-{
-}
-
-Benchmark* Benchmark::Arg(int x) {
-  imp_->Arg(x);
-  return this;
-}
-
-Benchmark* Benchmark::Unit(TimeUnit unit) {
-  imp_->Unit(unit);
-  return this;
-}
-
-Benchmark* Benchmark::Range(int start, int limit) {
-  imp_->Range(start, limit);
-  return this;
-}
-
-Benchmark* Benchmark::DenseRange(int start, int limit) {
-  imp_->DenseRange(start, limit);
-  return this;
-}
-
-Benchmark* Benchmark::ArgPair(int x, int y) {
-  imp_->ArgPair(x, y);
-  return this;
-}
-
-Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
-  imp_->RangePair(lo1, hi1, lo2, hi2);
-  return this;
-}
-
-Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
-  custom_arguments(this);
-  return this;
-}
-
-Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  imp_->RangeMultiplier(multiplier);
-  return this;
-}
-
-
-Benchmark* Benchmark::Repetitions(int t) {
-  imp_->Repetitions(t);
-  return this;
-}
-
-Benchmark* Benchmark::MinTime(double t) {
-  imp_->MinTime(t);
-  return this;
-}
-
-Benchmark* Benchmark::UseRealTime() {
-  imp_->UseRealTime();
-  return this;
-}
-
-Benchmark* Benchmark::UseManualTime() {
-  imp_->UseManualTime();
-  return this;
-}
-
-Benchmark* Benchmark::Complexity(BigO complexity) {
-  imp_->Complexity(complexity);
-  return this;
-}
-
-Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
-  imp_->Complexity(oLambda);
-  imp_->ComplexityLambda(complexity);
-  return this;
-}
-
-Benchmark* Benchmark::Threads(int t) {
-  imp_->Threads(t);
-  return this;
-}
-
-Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  imp_->ThreadRange(min_threads, max_threads);
-  return this;
-}
-
-Benchmark* Benchmark::ThreadPerCpu() {
-  imp_->ThreadPerCpu();
-  return this;
-}
-
-void Benchmark::SetName(const char* name) {
-  imp_->SetName(name);
-}
-
-void FunctionBenchmark::Run(State& st) {
-  func_(st);
-}
-
-} // end namespace internal
-
 namespace {
 
+BenchmarkReporter::Run CreateRunReport(
+    const benchmark::internal::Benchmark::Instance& b,
+    const internal::ThreadManager::Result& results, size_t iters,
+    double seconds) {
+  // Create report about this benchmark run.
+  BenchmarkReporter::Run report;
+
+  report.benchmark_name = b.name;
+  report.error_occurred = results.has_error_;
+  report.error_message = results.error_message_;
+  report.report_label = results.report_label_;
+  // Report the total iterations across all threads.
+  report.iterations = static_cast<int64_t>(iters) * b.threads;
+  report.time_unit = b.time_unit;
+
+  if (!report.error_occurred) {
+    double bytes_per_second = 0;
+    if (results.bytes_processed > 0 && seconds > 0.0) {
+      bytes_per_second = (results.bytes_processed / seconds);
+    }
+    double items_per_second = 0;
+    if (results.items_processed > 0 && seconds > 0.0) {
+      items_per_second = (results.items_processed / seconds);
+    }
+
+    if (b.use_manual_time) {
+      report.real_accumulated_time = results.manual_time_used;
+    } else {
+      report.real_accumulated_time = results.real_time_used;
+    }
+    report.cpu_accumulated_time = results.cpu_time_used;
+    report.bytes_per_second = bytes_per_second;
+    report.items_per_second = items_per_second;
+    report.complexity_n = results.complexity_n;
+    report.complexity = b.complexity;
+    report.complexity_lambda = b.complexity_lambda;
+  }
+  return report;
+}
+
 // Execute one thread of benchmark b for the specified number of iterations.
 // Adds the stats collected for the thread into *total.
 void RunInThread(const benchmark::internal::Benchmark::Instance* b,
                  size_t iters, int thread_id,
-                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
-  State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id, b->threads);
+                 internal::ThreadManager* manager) {
+  internal::ThreadTimer timer;
+  State st(iters, b->arg, thread_id, b->threads, &timer, manager);
   b->benchmark->Run(st);
-  CHECK(st.iterations() == st.max_iterations) <<
-    "Benchmark returned before State::KeepRunning() returned false!";
+  CHECK(st.iterations() == st.max_iterations)
+      << "Benchmark returned before State::KeepRunning() returned false!";
   {
-    MutexLock l(GetBenchmarkLock());
-    total->bytes_processed += st.bytes_processed();
-    total->items_processed += st.items_processed();
-    total->complexity_n += st.complexity_length_n();
+    MutexLock l(manager->GetBenchmarkMutex());
+    internal::ThreadManager::Result& results = manager->results;
+    results.cpu_time_used += timer.cpu_time_used();
+    results.real_time_used += timer.real_time_used();
+    results.manual_time_used += timer.manual_time_used();
+    results.bytes_processed += st.bytes_processed();
+    results.items_processed += st.items_processed();
+    results.complexity_n += st.complexity_length_n();
   }
-
-  timer_manager->Finalize();
+  manager->NotifyThreadComplete();
 }
 
-void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
-                  BenchmarkReporter* br,
-                  std::vector<BenchmarkReporter::Run>& complexity_reports)
-  EXCLUDES(GetBenchmarkLock()) {
+std::vector<BenchmarkReporter::Run> RunBenchmark(
+    const benchmark::internal::Benchmark::Instance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports) {
+  std::vector<BenchmarkReporter::Run> reports;  // return value
+
   size_t iters = 1;
-
-  std::vector<BenchmarkReporter::Run> reports;
-
-  std::vector<std::thread> pool;
-  if (b.multithreaded)
-    pool.resize(b.threads);
-
-  const int repeats = b.repetitions != 0 ? b.repetitions
-                                         : FLAGS_benchmark_repetitions;
+  std::unique_ptr<internal::ThreadManager> manager;
+  std::vector<std::thread> pool(b.threads - 1);
+  const int repeats =
+      b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions;
+  const bool report_aggregates_only =
+      repeats != 1 &&
+      (b.report_mode == internal::RM_Unspecified
+           ? FLAGS_benchmark_report_aggregates_only
+           : b.report_mode == internal::RM_ReportAggregatesOnly);
   for (int i = 0; i < repeats; i++) {
-    std::string mem;
     for (;;) {
       // Try benchmark
       VLOG(2) << "Running " << b.name << " for " << iters << "\n";
 
+      manager.reset(new internal::ThreadManager(b.threads));
+      for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+        pool[ti] = std::thread(&RunInThread, &b, iters,
+                               static_cast<int>(ti + 1), manager.get());
+      }
+      RunInThread(&b, iters, 0, manager.get());
+      manager->WaitForAllThreads();
+      for (std::thread& thread : pool) thread.join();
+      internal::ThreadManager::Result results;
       {
-        MutexLock l(GetBenchmarkLock());
-        GetReportLabel()->clear();
+        MutexLock l(manager->GetBenchmarkMutex());
+        results = manager->results;
       }
-      error_message = nullptr;
+      manager.reset();
+      // Adjust real/manual time stats since they were reported per thread.
+      results.real_time_used /= b.threads;
+      results.manual_time_used /= b.threads;
 
-      Notification done;
-      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
-
-      ThreadStats total;
-      running_benchmark = true;
-      if (b.multithreaded) {
-        // If this is out first iteration of the while(true) loop then the
-        // threads haven't been started and can't be joined. Otherwise we need
-        // to join the thread before replacing them.
-        for (std::thread& thread : pool) {
-          if (thread.joinable())
-            thread.join();
-        }
-        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-            pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti), &total);
-        }
-      } else {
-        // Run directly in this thread
-        RunInThread(&b, iters, 0, &total);
-      }
-      done.WaitForNotification();
-      running_benchmark = false;
-
-      const double cpu_accumulated_time = timer_manager->cpu_time_used();
-      const double real_accumulated_time = timer_manager->real_time_used();
-      const double manual_accumulated_time = timer_manager->manual_time_used();
-      timer_manager.reset();
-
-      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
-              << real_accumulated_time << "\n";
+      VLOG(2) << "Ran in " << results.cpu_time_used << "/"
+              << results.real_time_used << "\n";
 
       // Base decisions off of real time if requested by this benchmark.
-      double seconds = cpu_accumulated_time;
+      double seconds = results.cpu_time_used;
       if (b.use_manual_time) {
-          seconds = manual_accumulated_time;
+        seconds = results.manual_time_used;
       } else if (b.use_real_time) {
-          seconds = real_accumulated_time;
+        seconds = results.real_time_used;
       }
 
-      std::string label;
-      {
-        MutexLock l(GetBenchmarkLock());
-        label = *GetReportLabel();
-      }
-      error_message_type error_msg = error_message;
-
-      const double min_time = !IsZero(b.min_time) ? b.min_time
-                                                  : FLAGS_benchmark_min_time;
-
+      const double min_time =
+          !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
       // If this was the first run, was elapsed time or cpu time large enough?
       // If this is not the first run, go with the current value of iter.
-      if ((i > 0) || (error_msg != nullptr) ||
-          (iters >= kMaxIterations) ||
-          (seconds >= min_time) ||
-          (real_accumulated_time >= 5*min_time)) {
-
-        // Create report about this benchmark run.
-        BenchmarkReporter::Run report;
-        report.benchmark_name = b.name;
-        report.error_occurred = error_msg != nullptr;
-        report.error_message = error_msg != nullptr ? error_msg : "";
-        report.report_label = label;
-        // Report the total iterations across all threads.
-        report.iterations = static_cast<int64_t>(iters) * b.threads;
-        report.time_unit = b.time_unit;
-
-        if (!report.error_occurred) {
-          double bytes_per_second = 0;
-          if (total.bytes_processed > 0 && seconds > 0.0) {
-            bytes_per_second = (total.bytes_processed / seconds);
-          }
-          double items_per_second = 0;
-          if (total.items_processed > 0 && seconds > 0.0) {
-            items_per_second = (total.items_processed / seconds);
-          }
-
-          if (b.use_manual_time) {
-            report.real_accumulated_time = manual_accumulated_time;
-          } else {
-            report.real_accumulated_time = real_accumulated_time;
-          }
-          report.cpu_accumulated_time = cpu_accumulated_time;
-          report.bytes_per_second = bytes_per_second;
-          report.items_per_second = items_per_second;
-          report.complexity_n = total.complexity_n;
-          report.complexity = b.complexity;
-          report.complexity_lambda = b.complexity_lambda;
-          if(report.complexity != oNone)
-            complexity_reports.push_back(report);
-        }
-
+      if ((i > 0) || results.has_error_ || (iters >= kMaxIterations) ||
+          (seconds >= min_time) || (results.real_time_used >= 5 * min_time)) {
+        BenchmarkReporter::Run report =
+            CreateRunReport(b, results, iters, seconds);
+        if (!report.error_occurred && b.complexity != oNone)
+          complexity_reports->push_back(report);
         reports.push_back(report);
         break;
       }
@@ -898,85 +358,103 @@
       iters = static_cast<int>(next_iters + 0.5);
     }
   }
-  std::vector<BenchmarkReporter::Run> additional_run_stats = ComputeStats(reports);
-  reports.insert(reports.end(), additional_run_stats.begin(),
-                 additional_run_stats.end());
-
-  if((b.complexity != oNone) && b.last_benchmark_instance) {
-    additional_run_stats = ComputeBigO(complexity_reports);
-    reports.insert(reports.end(), additional_run_stats.begin(),
-                   additional_run_stats.end());
-    complexity_reports.clear();
+  // Calculate additional statistics
+  auto stat_reports = ComputeStats(reports);
+  if ((b.complexity != oNone) && b.last_benchmark_instance) {
+    auto additional_run_stats = ComputeBigO(*complexity_reports);
+    stat_reports.insert(stat_reports.end(), additional_run_stats.begin(),
+                        additional_run_stats.end());
+    complexity_reports->clear();
   }
 
-  br->ReportRuns(reports);
-
-  if (b.multithreaded) {
-    for (std::thread& thread : pool)
-      thread.join();
-  }
+  if (report_aggregates_only) reports.clear();
+  reports.insert(reports.end(), stat_reports.begin(), stat_reports.end());
+  return reports;
 }
 
 }  // namespace
+}  // namespace internal
 
-State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
-             int thread_i, int n_threads)
-    : started_(false), finished_(false), total_iterations_(0),
-      has_range_x_(has_x), range_x_(x),
-      has_range_y_(has_y), range_y_(y),
-      bytes_processed_(0), items_processed_(0),
+State::State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+             int n_threads, internal::ThreadTimer* timer,
+             internal::ThreadManager* manager)
+    : started_(false),
+      finished_(false),
+      total_iterations_(0),
+      range_(ranges),
+      bytes_processed_(0),
+      items_processed_(0),
       complexity_n_(0),
       error_occurred_(false),
       thread_index(thread_i),
       threads(n_threads),
-      max_iterations(max_iters)
-{
-    CHECK(max_iterations != 0) << "At least one iteration must be run";
-    CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+      max_iterations(max_iters),
+      timer_(timer),
+      manager_(manager) {
+  CHECK(max_iterations != 0) << "At least one iteration must be run";
+  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
 }
 
 void State::PauseTiming() {
   // Add in time accumulated so far
-  CHECK(running_benchmark);
   CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StopTimer();
+  timer_->StopTimer();
 }
 
 void State::ResumeTiming() {
-  CHECK(running_benchmark);
   CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StartTimer();
+  timer_->StartTimer();
 }
 
 void State::SkipWithError(const char* msg) {
   CHECK(msg);
   error_occurred_ = true;
-  error_message_type expected_no_error_msg = nullptr;
-  error_message.compare_exchange_weak(expected_no_error_msg,
-    const_cast<error_message_type>(msg));
-  started_ = finished_ = true;
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (manager_->results.has_error_ == false) {
+      manager_->results.error_message_ = msg;
+      manager_->results.has_error_ = true;
+    }
+  }
   total_iterations_ = max_iterations;
-  timer_manager->RemoveErroredThread();
+  if (timer_->running()) timer_->StopTimer();
 }
 
-void State::SetIterationTime(double seconds)
-{
-  CHECK(running_benchmark);
-  timer_manager->SetIterationTime(seconds);
+void State::SetIterationTime(double seconds) {
+  timer_->SetIterationTime(seconds);
 }
 
 void State::SetLabel(const char* label) {
-  CHECK(running_benchmark);
-  MutexLock l(GetBenchmarkLock());
-  *GetReportLabel() = label;
+  MutexLock l(manager_->GetBenchmarkMutex());
+  manager_->results.report_label_ = label;
+}
+
+void State::StartKeepRunning() {
+  CHECK(!started_ && !finished_);
+  started_ = true;
+  manager_->StartStopBarrier();
+  if (!error_occurred_) ResumeTiming();
+}
+
+void State::FinishKeepRunning() {
+  CHECK(started_ && (!finished_ || error_occurred_));
+  if (!error_occurred_) {
+    PauseTiming();
+  }
+  // Total iterations now is one greater than max iterations. Fix this.
+  total_iterations_ = max_iterations;
+  finished_ = true;
+  manager_->StartStopBarrier();
 }
 
 namespace internal {
 namespace {
 
-void RunMatchingBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
-                           BenchmarkReporter* reporter) {
-  CHECK(reporter != nullptr);
+void RunBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
+                   BenchmarkReporter* console_reporter,
+                   BenchmarkReporter* file_reporter) {
+  // Note the file_reporter can be null.
+  CHECK(console_reporter != nullptr);
 
   // Determine the width of the name field using a minimum width of 10.
   bool has_repetitions = FLAGS_benchmark_repetitions > 1;
@@ -986,8 +464,7 @@
         std::max<size_t>(name_field_width, benchmark.name.size());
     has_repetitions |= benchmark.repetitions > 1;
   }
-  if (has_repetitions)
-    name_field_width += std::strlen("_stddev");
+  if (has_repetitions) name_field_width += std::strlen("_stddev");
 
   // Print header here
   BenchmarkReporter::Context context;
@@ -1000,55 +477,121 @@
   // Keep track of running times of all instances of current benchmark
   std::vector<BenchmarkReporter::Run> complexity_reports;
 
-  if (reporter->ReportContext(context)) {
+  // We flush streams after invoking reporter methods that write to them. This
+  // ensures users get timely updates even when streams are not line-buffered.
+  auto flushStreams = [](BenchmarkReporter* reporter) {
+    if (!reporter) return;
+    std::flush(reporter->GetOutputStream());
+    std::flush(reporter->GetErrorStream());
+  };
+
+  if (console_reporter->ReportContext(context) &&
+      (!file_reporter || file_reporter->ReportContext(context))) {
+    flushStreams(console_reporter);
+    flushStreams(file_reporter);
     for (const auto& benchmark : benchmarks) {
-      RunBenchmark(benchmark, reporter, complexity_reports);
+      std::vector<BenchmarkReporter::Run> reports =
+          RunBenchmark(benchmark, &complexity_reports);
+      console_reporter->ReportRuns(reports);
+      if (file_reporter) file_reporter->ReportRuns(reports);
+      flushStreams(console_reporter);
+      flushStreams(file_reporter);
     }
   }
+  console_reporter->Finalize();
+  if (file_reporter) file_reporter->Finalize();
+  flushStreams(console_reporter);
+  flushStreams(file_reporter);
 }
 
-std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
+std::unique_ptr<BenchmarkReporter> CreateReporter(
+    std::string const& name, ConsoleReporter::OutputOptions allow_color) {
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
-  if (FLAGS_benchmark_format == "console") {
-    return PtrType(new ConsoleReporter);
-  } else if (FLAGS_benchmark_format == "json") {
+  if (name == "console") {
+    return PtrType(new ConsoleReporter(allow_color));
+  } else if (name == "json") {
     return PtrType(new JSONReporter);
-  } else if (FLAGS_benchmark_format == "csv") {
+  } else if (name == "csv") {
     return PtrType(new CSVReporter);
   } else {
-    std::cerr << "Unexpected format: '" << FLAGS_benchmark_format << "'\n";
+    std::cerr << "Unexpected format: '" << name << "'\n";
     std::exit(1);
   }
 }
 
-} // end namespace
-} // end namespace internal
+}  // end namespace
+}  // end namespace internal
 
 size_t RunSpecifiedBenchmarks() {
-  return RunSpecifiedBenchmarks(nullptr);
+  return RunSpecifiedBenchmarks(nullptr, nullptr);
 }
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter) {
+  return RunSpecifiedBenchmarks(console_reporter, nullptr);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
+                              BenchmarkReporter* file_reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
 
+  // Setup the reporters
+  std::ofstream output_file;
+  std::unique_ptr<BenchmarkReporter> default_console_reporter;
+  std::unique_ptr<BenchmarkReporter> default_file_reporter;
+  if (!console_reporter) {
+    auto output_opts = ConsoleReporter::OO_None;
+    if (FLAGS_benchmark_color == "auto")
+      output_opts = IsColorTerminal() ? ConsoleReporter::OO_Color
+                                      : ConsoleReporter::OO_None;
+    else
+      output_opts = IsTruthyFlagValue(FLAGS_benchmark_color)
+                        ? ConsoleReporter::OO_Color
+                        : ConsoleReporter::OO_None;
+    default_console_reporter =
+        internal::CreateReporter(FLAGS_benchmark_format, output_opts);
+    console_reporter = default_console_reporter.get();
+  }
+  auto& Out = console_reporter->GetOutputStream();
+  auto& Err = console_reporter->GetErrorStream();
+
+  std::string const& fname = FLAGS_benchmark_out;
+  if (fname == "" && file_reporter) {
+    Err << "A custom file reporter was provided but "
+           "--benchmark_out=<file> was not specified."
+        << std::endl;
+    std::exit(1);
+  }
+  if (fname != "") {
+    output_file.open(fname);
+    if (!output_file.is_open()) {
+      Err << "invalid file name: '" << fname << std::endl;
+      std::exit(1);
+    }
+    if (!file_reporter) {
+      default_file_reporter = internal::CreateReporter(
+          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
+      file_reporter = default_file_reporter.get();
+    }
+    file_reporter->SetOutputStream(&output_file);
+    file_reporter->SetErrorStream(&output_file);
+  }
+
   std::vector<internal::Benchmark::Instance> benchmarks;
-  auto families = internal::BenchmarkFamilies::GetInstance();
-  if (!families->FindBenchmarks(spec, &benchmarks)) return 0;
+  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
+
+  if (benchmarks.empty()) {
+    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
+    return 0;
+  }
 
   if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks)
-      std::cout <<  benchmark.name << "\n";
+    for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n";
   } else {
-    std::unique_ptr<BenchmarkReporter> default_reporter;
-    if (!reporter) {
-      default_reporter = internal::GetDefaultReporter();
-      reporter = default_reporter.get();
-    }
-    internal::RunMatchingBenchmarks(benchmarks, reporter);
-    reporter->Finalize();
+    internal::RunBenchmarks(benchmarks, console_reporter, file_reporter);
   }
+
   return benchmarks.size();
 }
 
@@ -1061,8 +604,11 @@
           "          [--benchmark_filter=<regex>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_report_aggregates_only={true|false}\n"
           "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--color_print={true|false}]\n"
+          "          [--benchmark_out=<filename>]\n"
+          "          [--benchmark_out_format=<json|console|csv>]\n"
+          "          [--benchmark_color={auto|true|false}]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
 }
@@ -1070,19 +616,23 @@
 void ParseCommandLineFlags(int* argc, char** argv) {
   using namespace benchmark;
   for (int i = 1; i < *argc; ++i) {
-    if (
-        ParseBoolFlag(argv[i], "benchmark_list_tests",
+    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                       &FLAGS_benchmark_list_tests) ||
-        ParseStringFlag(argv[i], "benchmark_filter",
-                        &FLAGS_benchmark_filter) ||
+        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
         ParseDoubleFlag(argv[i], "benchmark_min_time",
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
-        ParseStringFlag(argv[i], "benchmark_format",
-                        &FLAGS_benchmark_format) ||
-        ParseBoolFlag(argv[i], "color_print",
-                       &FLAGS_color_print) ||
+        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
+                      &FLAGS_benchmark_report_aggregates_only) ||
+        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
+        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
+        ParseStringFlag(argv[i], "benchmark_out_format",
+                        &FLAGS_benchmark_out_format) ||
+        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
+        // "color_print" is the deprecated name for "benchmark_color".
+        // TODO: Remove this.
+        ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
       for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
 
@@ -1092,32 +642,26 @@
       PrintUsageAndExit();
     }
   }
-
-  if (FLAGS_benchmark_format != "console" &&
-      FLAGS_benchmark_format != "json" &&
-      FLAGS_benchmark_format != "csv") {
+  for (auto const* flag :
+       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
+    if (*flag != "console" && *flag != "json" && *flag != "csv") {
+      PrintUsageAndExit();
+    }
+  if (FLAGS_benchmark_color.empty()) {
     PrintUsageAndExit();
   }
 }
 
-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-    std::unique_ptr<Benchmark> bench_ptr(bench);
-    BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-    families->AddBenchmark(std::move(bench_ptr));
-    return bench;
+int InitializeStreams() {
+  static std::ios_base::Init init;
+  return 0;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
 void Initialize(int* argc, char** argv) {
   internal::ParseCommandLineFlags(argc, argv);
-  internal::SetLogLevel(FLAGS_v);
-  // TODO remove this. It prints some output the first time it is called.
-  // We don't want to have this ouput printed during benchmarking.
-  MyCPUUsage();
-  // The first call to walltime::Now initialized it. Call it once to
-  // prevent the initialization from happening in a benchmark.
-  walltime::Now();
+  internal::LogLevel() = FLAGS_v;
 }
 
-} // end namespace benchmark
+}  // end namespace benchmark
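
For orientation, the slimmed-down Initialize() above now only parses flags and seeds the log level; the work happens in RunSpecifiedBenchmarks(). A minimal sketch of a client main() driving this pair (BM_Noop is a hypothetical placeholder; the macros and entry points are the library's own):

#include "benchmark/benchmark.h"

static void BM_Noop(benchmark::State& state) {
  while (state.KeepRunning()) {
    // Empty body: measures bare iteration overhead.
  }
}
BENCHMARK(BM_Noop);

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);   // consumes the --benchmark_* flags
  benchmark::RunSpecifiedBenchmarks();  // runs everything matching the filter
  return 0;
}

Invoked, for example, with the new output flags:

  ./mybench --benchmark_out=run.json --benchmark_out_format=json --benchmark_color=auto
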
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
new file mode 100644
index 0000000..8b97ce6
--- /dev/null
+++ b/src/benchmark_api_internal.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_API_INTERNAL_H
+#define BENCHMARK_API_INTERNAL_H
+
+#include "benchmark/benchmark_api.h"
+
+#include <cmath>
+#include <iosfwd>
+#include <limits>
+#include <string>
+#include <vector>
+
+namespace benchmark {
+namespace internal {
+
+// Information kept per benchmark we may want to run
+struct Benchmark::Instance {
+  std::string name;
+  Benchmark* benchmark;
+  ReportMode report_mode;
+  std::vector<int> arg;
+  TimeUnit time_unit;
+  int range_multiplier;
+  bool use_real_time;
+  bool use_manual_time;
+  BigO complexity;
+  BigOFunc* complexity_lambda;
+  bool last_benchmark_instance;
+  int repetitions;
+  double min_time;
+  int threads;  // Number of concurrent threads to use
+};
+
+bool FindBenchmarksInternal(const std::string& re,
+                            std::vector<Benchmark::Instance>* benchmarks,
+                            std::ostream* Err);
+
+namespace {
+
+bool IsZero(double n) {
+  return std::abs(n) < std::numeric_limits<double>::epsilon();
+}
+
+}  // end namespace
+}  // end namespace internal
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_API_INTERNAL_H
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
new file mode 100644
index 0000000..4e580d8
--- /dev/null
+++ b/src/benchmark_register.cc
@@ -0,0 +1,439 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <thread>
+
+#include "check.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "stat.h"
+#include "string_util.h"
+#include "sysinfo.h"
+#include "timers.h"
+
+namespace benchmark {
+
+namespace {
+// For non-dense Range, intermediate values are powers of kRangeMultiplier.
+static const int kRangeMultiplier = 8;
+// The size of a benchmark family determines the number of inputs on which to
+// repeat the benchmark. If this is "large" then warn the user during configuration.
+static const size_t kMaxFamilySize = 100;
+}  // end namespace
+
+namespace internal {
+
+//=============================================================================//
+//                         BenchmarkFamilies
+//=============================================================================//
+
+// Class for managing registered benchmarks.  Note that each registered
+// benchmark identifies a family of related benchmarks to run.
+class BenchmarkFamilies {
+ public:
+  static BenchmarkFamilies* GetInstance();
+
+  // Registers a benchmark family and returns the index assigned to it.
+  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
+
+  // Extract the list of benchmark instances that match the specified
+  // regular expression.
+  bool FindBenchmarks(const std::string& re,
+                      std::vector<Benchmark::Instance>* benchmarks,
+                      std::ostream* Err);
+
+ private:
+  BenchmarkFamilies() {}
+
+  std::vector<std::unique_ptr<Benchmark>> families_;
+  Mutex mutex_;
+};
+
+BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
+  static BenchmarkFamilies instance;
+  return &instance;
+}
+
+size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
+  MutexLock l(mutex_);
+  size_t index = families_.size();
+  families_.push_back(std::move(family));
+  return index;
+}
+
+bool BenchmarkFamilies::FindBenchmarks(
+    const std::string& spec, std::vector<Benchmark::Instance>* benchmarks,
+    std::ostream* ErrStream) {
+  CHECK(ErrStream);
+  auto& Err = *ErrStream;
+  // Make regular expression out of command-line flag
+  std::string error_msg;
+  Regex re;
+  if (!re.Init(spec, &error_msg)) {
+    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    return false;
+  }
+
+  // Special list of thread counts to use when none are specified
+  const std::vector<int> one_thread = {1};
+
+  MutexLock l(mutex_);
+  for (std::unique_ptr<Benchmark>& family : families_) {
+    // Family was deleted or benchmark doesn't match
+    if (!family) continue;
+
+    if (family->ArgsCnt() == -1) {
+      family->Args({});
+    }
+    const std::vector<int>* thread_counts =
+        (family->thread_counts_.empty()
+             ? &one_thread
+             : &static_cast<const std::vector<int>&>(family->thread_counts_));
+    const size_t family_size = family->args_.size() * thread_counts->size();
+    // The benchmark will be run on at least 'family_size' different inputs.
+    // If 'family_size' is very large, warn the user.
+    if (family_size > kMaxFamilySize) {
+      Err << "The number of inputs is very large. " << family->name_
+          << " will be repeated at least " << family_size << " times.\n";
+    }
+    // Reserve space in the special case of the "." regex, since then we know
+    // the final family size.
+    if (spec == ".") benchmarks->reserve(family_size);
+
+    for (auto const& args : family->args_) {
+      for (int num_threads : *thread_counts) {
+        Benchmark::Instance instance;
+        instance.name = family->name_;
+        instance.benchmark = family.get();
+        instance.report_mode = family->report_mode_;
+        instance.arg = args;
+        instance.time_unit = family->time_unit_;
+        instance.range_multiplier = family->range_multiplier_;
+        instance.min_time = family->min_time_;
+        instance.repetitions = family->repetitions_;
+        instance.use_real_time = family->use_real_time_;
+        instance.use_manual_time = family->use_manual_time_;
+        instance.complexity = family->complexity_;
+        instance.complexity_lambda = family->complexity_lambda_;
+        instance.threads = num_threads;
+
+        // Add arguments to instance name
+        size_t arg_i = 0;
+        for (auto const& arg : args) {
+          instance.name += "/";
+
+          if (arg_i < family->arg_names_.size()) {
+            const auto& arg_name = family->arg_names_[arg_i];
+            if (!arg_name.empty()) {
+              instance.name +=
+                  StringPrintF("%s:", family->arg_names_[arg_i].c_str());
+            }
+          }
+
+          AppendHumanReadable(arg, &instance.name);
+          ++arg_i;
+        }
+
+        if (!IsZero(family->min_time_)) {
+          instance.name += StringPrintF("/min_time:%0.3f", family->min_time_);
+        }
+        if (family->repetitions_ != 0) {
+          instance.name += StringPrintF("/repeats:%d", family->repetitions_);
+        }
+        if (family->use_manual_time_) {
+          instance.name += "/manual_time";
+        } else if (family->use_real_time_) {
+          instance.name += "/real_time";
+        }
+
+        // Add the number of threads used to the name
+        if (!family->thread_counts_.empty()) {
+          instance.name += StringPrintF("/threads:%d", instance.threads);
+        }
+
+        if (re.Match(instance.name)) {
+          instance.last_benchmark_instance = (&args == &family->args_.back());
+          benchmarks->push_back(std::move(instance));
+        }
+      }
+    }
+  }
+  return true;
+}
+
+Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
+  std::unique_ptr<Benchmark> bench_ptr(bench);
+  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
+  families->AddBenchmark(std::move(bench_ptr));
+  return bench;
+}
+
+// FIXME: This function is a hack so that benchmark.cc can access
+// `BenchmarkFamilies`
+bool FindBenchmarksInternal(const std::string& re,
+                            std::vector<Benchmark::Instance>* benchmarks,
+                            std::ostream* Err) {
+  return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err);
+}
+
+//=============================================================================//
+//                               Benchmark
+//=============================================================================//
+
+Benchmark::Benchmark(const char* name)
+    : name_(name),
+      report_mode_(RM_Unspecified),
+      time_unit_(kNanosecond),
+      range_multiplier_(kRangeMultiplier),
+      min_time_(0),
+      repetitions_(0),
+      use_real_time_(false),
+      use_manual_time_(false),
+      complexity_(oNone),
+      complexity_lambda_(nullptr) {}
+
+Benchmark::~Benchmark() {}
+
+void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  static const int kint32max = std::numeric_limits<int32_t>::max();
+
+  // Now space out the benchmarks in multiples of "mult"
+  for (int32_t i = 1; i < kint32max / mult; i *= mult) {
+    if (i >= hi) break;
+    if (i > lo) {
+      dst->push_back(i);
+    }
+  }
+  // Add "hi" (if different from "lo")
+  if (hi != lo) {
+    dst->push_back(hi);
+  }
+}
+
+Benchmark* Benchmark::Arg(int x) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  args_.push_back({x});
+  return this;
+}
+
+Benchmark* Benchmark::Unit(TimeUnit unit) {
+  time_unit_ = unit;
+  return this;
+}
+
+Benchmark* Benchmark::Range(int start, int limit) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  std::vector<int> arglist;
+  AddRange(&arglist, start, limit, range_multiplier_);
+
+  for (int i : arglist) {
+    args_.push_back({i});
+  }
+  return this;
+}
+
+Benchmark* Benchmark::Ranges(const std::vector<std::pair<int, int>>& ranges) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  std::vector<std::vector<int>> arglists(ranges.size());
+  std::size_t total = 1;
+  for (std::size_t i = 0; i < ranges.size(); i++) {
+    AddRange(&arglists[i], ranges[i].first, ranges[i].second,
+             range_multiplier_);
+    total *= arglists[i].size();
+  }
+
+  std::vector<std::size_t> ctr(arglists.size(), 0);
+
+  for (std::size_t i = 0; i < total; i++) {
+    std::vector<int> tmp;
+    tmp.reserve(arglists.size());
+
+    for (std::size_t j = 0; j < arglists.size(); j++) {
+      tmp.push_back(arglists[j].at(ctr[j]));
+    }
+
+    args_.push_back(std::move(tmp));
+
+    for (std::size_t j = 0; j < arglists.size(); j++) {
+      if (ctr[j] + 1 < arglists[j].size()) {
+        ++ctr[j];
+        break;
+      }
+      ctr[j] = 0;
+    }
+  }
+  return this;
+}
+
+Benchmark* Benchmark::ArgName(const std::string& name) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  arg_names_ = {name};
+  return this;
+}
+
+Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  arg_names_ = names;
+  return this;
+}
+
+Benchmark* Benchmark::DenseRange(int start, int limit, int step) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  CHECK_GE(start, 0);
+  CHECK_LE(start, limit);
+  for (int arg = start; arg <= limit; arg += step) {
+    args_.push_back({arg});
+  }
+  return this;
+}
+
+Benchmark* Benchmark::Args(const std::vector<int>& args) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  args_.push_back(args);
+  return this;
+}
+
+Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+  custom_arguments(this);
+  return this;
+}
+
+Benchmark* Benchmark::RangeMultiplier(int multiplier) {
+  CHECK(multiplier > 1);
+  range_multiplier_ = multiplier;
+  return this;
+}
+
+Benchmark* Benchmark::Repetitions(int n) {
+  CHECK(n > 0);
+  repetitions_ = n;
+  return this;
+}
+
+Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
+  report_mode_ = value ? RM_ReportAggregatesOnly : RM_Default;
+  return this;
+}
+
+Benchmark* Benchmark::MinTime(double t) {
+  CHECK(t > 0.0);
+  min_time_ = t;
+  return this;
+}
+
+Benchmark* Benchmark::UseRealTime() {
+  CHECK(!use_manual_time_)
+      << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  use_real_time_ = true;
+  return this;
+}
+
+Benchmark* Benchmark::UseManualTime() {
+  CHECK(!use_real_time_)
+      << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  use_manual_time_ = true;
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigO complexity) {
+  complexity_ = complexity;
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
+  complexity_lambda_ = complexity;
+  complexity_ = oLambda;
+  return this;
+}
+
+Benchmark* Benchmark::Threads(int t) {
+  CHECK_GT(t, 0);
+  thread_counts_.push_back(t);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+
+  AddRange(&thread_counts_, min_threads, max_threads, 2);
+  return this;
+}
+
+Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
+                                       int stride) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+  CHECK_GE(stride, 1);
+
+  for (auto i = min_threads; i < max_threads; i += stride) {
+    thread_counts_.push_back(i);
+  }
+  thread_counts_.push_back(max_threads);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadPerCpu() {
+  static int num_cpus = NumCPUs();
+  thread_counts_.push_back(num_cpus);
+  return this;
+}
+
+void Benchmark::SetName(const char* name) { name_ = name; }
+
+int Benchmark::ArgsCnt() const {
+  if (args_.empty()) {
+    if (arg_names_.empty()) return -1;
+    return static_cast<int>(arg_names_.size());
+  }
+  return static_cast<int>(args_.front().size());
+}
+
+//=============================================================================//
+//                            FunctionBenchmark
+//=============================================================================//
+
+void FunctionBenchmark::Run(State& st) { func_(st); }
+
+}  // end namespace internal
+}  // end namespace benchmark
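
As a usage note, the chainable setters above are what generate the instance names that FindBenchmarks() matches against the filter regex. A hedged sketch (BM_Copy is a hypothetical benchmark function):

// Range() expands to {8, 64} here: lo, powers of the multiplier, then hi.
BENCHMARK(BM_Copy)->RangeMultiplier(8)->Range(8, 64);
// Ranges() takes the cartesian product: {8,1} {64,1} {8,2} {64,2}.
BENCHMARK(BM_Copy)->Ranges({{8, 64}, {1, 2}});
// Produces the instance name "BM_Copy/32/threads:4".
BENCHMARK(BM_Copy)->Arg(32)->Threads(4);
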
diff --git a/src/check.h b/src/check.h
index 4572bab..6f1fe0c 100644
--- a/src/check.h
+++ b/src/check.h
@@ -13,53 +13,52 @@
 typedef void(AbortHandlerT)();
 
 inline AbortHandlerT*& GetAbortHandler() {
-    static AbortHandlerT* handler = &std::abort;
-    return handler;
+  static AbortHandlerT* handler = &std::abort;
+  return handler;
 }
 
 BENCHMARK_NORETURN inline void CallAbortHandler() {
-    GetAbortHandler()();
-    std::abort(); // fallback to enforce noreturn
+  GetAbortHandler()();
+  std::abort();  // fallback to enforce noreturn
 }
 
 // CheckHandler is the class constructed by failing CHECK macros. CheckHandler
 // will log information about the failures and abort when it is destructed.
 class CheckHandler {
-public:
+ public:
   CheckHandler(const char* check, const char* file, const char* func, int line)
-    : log_(GetErrorLogInstance())
-  {
-    log_ << file << ":" << line << ": " << func << ": Check `"
-          << check << "' failed. ";
+      : log_(GetErrorLogInstance()) {
+    log_ << file << ":" << line << ": " << func << ": Check `" << check
+         << "' failed. ";
   }
 
-  std::ostream& GetLog() {
-    return log_;
-  }
+  LogType& GetLog() { return log_; }
 
   BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-      log_ << std::endl;
-      CallAbortHandler();
+    log_ << std::endl;
+    CallAbortHandler();
   }
 
-  CheckHandler & operator=(const CheckHandler&) = delete;
+  CheckHandler& operator=(const CheckHandler&) = delete;
   CheckHandler(const CheckHandler&) = delete;
   CheckHandler() = delete;
-private:
-  std::ostream& log_;
+
+ private:
+  LogType& log_;
 };
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
 
 // The CHECK macro returns a std::ostream object that can have extra information
 // written to it.
 #ifndef NDEBUG
-# define CHECK(b)  (b ? ::benchmark::internal::GetNullLogInstance()        \
-                      : ::benchmark::internal::CheckHandler(               \
-                          #b, __FILE__, __func__, __LINE__).GetLog())
+#define CHECK(b)                                                             \
+  (b ? ::benchmark::internal::GetNullLogInstance()                           \
+     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+           .GetLog())
 #else
-# define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif
 
 #define CHECK_EQ(a, b) CHECK((a) == (b))
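
Usage inside the library's own sources stays the same after the reflow: a failing CHECK yields a stream that extra context can be shifted onto before the destructor aborts. A short sketch:

void Process(int n) {
  // Logs "<file>:<line>: Process: Check `n > 0' failed. n was ..." and aborts
  // via the installed abort handler; with NDEBUG the macro is a no-op stream.
  CHECK(n > 0) << "n was " << n;
  CHECK_EQ(n % 2, 0) << "n must be even";
}
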
diff --git a/src/colorprint.cc b/src/colorprint.cc
index efb8626..513376b 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -16,19 +16,20 @@
 
 #include <cstdarg>
 #include <cstdio>
-#include <cstdarg>
-#include <string>
+#include <cstdlib>
+#include <cstring>
 #include <memory>
+#include <string>
 
-#include "commandlineflags.h"
 #include "check.h"
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
 #include <Windows.h>
-#endif
-
-DECLARE_bool(color_print);
+#include <io.h>
+#else
+#include <unistd.h>
+#endif  // BENCHMARK_OS_WINDOWS
 
 namespace benchmark {
 namespace {
@@ -81,7 +82,7 @@
 
 }  // end namespace
 
-std::string FormatString(const char *msg, va_list args) {
+std::string FormatString(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptively make a copy
   va_list args_cp;
   va_copy(args_cp, args);
@@ -95,13 +96,13 @@
   // currently there is no error handling for failure, so this is a hack.
   CHECK(ret >= 0);
 
-  if (ret == 0) // handle empty expansion
+  if (ret == 0)  // handle empty expansion
     return {};
   else if (static_cast<size_t>(ret) < size)
     return local_buff;
   else {
     // we did not provide a long enough buffer on our first attempt.
-    size = (size_t)ret + 1; // + 1 for the null byte
+    size = (size_t)ret + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
     ret = std::vsnprintf(buff.get(), size, msg, args);
     CHECK(ret > 0 && ((size_t)ret) < size);
@@ -109,7 +110,7 @@
   }
 }
 
-std::string FormatString(const char *msg, ...) {
+std::string FormatString(const char* msg, ...) {
   va_list args;
   va_start(args, msg);
   auto tmp = FormatString(msg, args);
@@ -120,14 +121,15 @@
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
+  ColorPrintf(out, color, fmt, args);
+  va_end(args);
+}
 
-  if (!FLAGS_color_print) {
-    out << FormatString(fmt, args);
-    va_end(args);
-    return;
-  }
-
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args) {
 #ifdef BENCHMARK_OS_WINDOWS
+  ((void)out);  // suppress unused warning
+
   const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 
   // Gets the current text color.
@@ -151,8 +153,36 @@
   if (color_code) out << FormatString("\033[0;3%sm", color_code);
   out << FormatString(fmt, args) << "\033[m";
 #endif
+}
 
-  va_end(args);
+bool IsColorTerminal() {
+#if BENCHMARK_OS_WINDOWS
+  // On Windows the TERM variable is usually not set, but the
+  // console there does support colors.
+  return 0 != _isatty(_fileno(stdout));
+#else
+  // On non-Windows platforms, we rely on the TERM variable. This list of
+  // supported TERM values is copied from Google Test:
+  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+  const char* const SUPPORTED_TERM_VALUES[] = {
+      "xterm",         "xterm-color",     "xterm-256color",
+      "screen",        "screen-256color", "tmux",
+      "tmux-256color", "rxvt-unicode",    "rxvt-unicode-256color",
+      "linux",         "cygwin",
+  };
+
+  const char* const term = getenv("TERM");
+
+  bool term_supports_color = false;
+  for (const char* candidate : SUPPORTED_TERM_VALUES) {
+    if (term && 0 == strcmp(term, candidate)) {
+      term_supports_color = true;
+      break;
+    }
+  }
+
+  return 0 != isatty(fileno(stdout)) && term_supports_color;
+#endif  // BENCHMARK_OS_WINDOWS
 }
 
 }  // end namespace benchmark
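
IsColorTerminal() gives callers the decision the old FLAGS_color_print global used to make implicitly. A sketch of the intended pattern (mirroring what console_reporter.cc does with its printer function below):

void PrintStatus(std::ostream& out, const char* name) {
  if (benchmark::IsColorTerminal()) {
    benchmark::ColorPrintf(out, benchmark::COLOR_GREEN, "%-20s OK\n", name);
  } else {
    out << benchmark::FormatString("%-20s OK\n", name);
  }
}
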
diff --git a/src/colorprint.h b/src/colorprint.h
index 2b3c082..9f6fab9 100644
--- a/src/colorprint.h
+++ b/src/colorprint.h
@@ -2,8 +2,8 @@
 #define BENCHMARK_COLORPRINT_H_
 
 #include <cstdarg>
-#include <string>
 #include <iostream>
+#include <string>
 
 namespace benchmark {
 enum LogColor {
@@ -20,8 +20,14 @@
 std::string FormatString(const char* msg, va_list args);
 std::string FormatString(const char* msg, ...);
 
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args);
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
 
+// Returns true if stdout appears to be a terminal that supports colored
+// output, false otherwise.
+bool IsColorTerminal();
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COLORPRINT_H_
diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc
index 3e9a37a..72534e0 100644
--- a/src/commandlineflags.cc
+++ b/src/commandlineflags.cc
@@ -14,6 +14,7 @@
 
 #include "commandlineflags.h"
 
+#include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
@@ -43,7 +44,7 @@
       // The parsed value overflows as a long.  (strtol() returns
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
-          // The parsed value overflows as an Int32.
+      // The parsed value overflows as an Int32.
       ) {
     std::cerr << src_text << " is expected to be a 32-bit integer, "
               << "but actually has value \"" << str << "\", "
@@ -74,17 +75,6 @@
   return true;
 }
 
-inline const char* GetEnv(const char* name) {
-#if defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
-  // Environment variables which we programmatically clear will be set to the
-  // empty string rather than unset (nullptr).  Handle that case.
-  const char* const env = getenv(name);
-  return (env != nullptr && env[0] != '\0') ? env : nullptr;
-#else
-  return getenv(name);
-#endif
-}
-
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
@@ -104,8 +94,9 @@
 // The value is considered true iff it's not "0".
 bool BoolFromEnv(const char* flag, bool default_value) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = GetEnv(env_var.c_str());
-  return string_value == nullptr ? default_value : strcmp(string_value, "0") != 0;
+  const char* const string_value = getenv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
 }
 
 // Reads and returns a 32-bit integer stored in the environment
@@ -113,7 +104,7 @@
 // doesn't represent a valid 32-bit integer, returns default_value.
 int32_t Int32FromEnv(const char* flag, int32_t default_value) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = GetEnv(env_var.c_str());
+  const char* const string_value = getenv(env_var.c_str());
   if (string_value == nullptr) {
     // The environment variable is not set.
     return default_value;
@@ -133,7 +124,7 @@
 // the given flag; if it's not set, returns default_value.
 const char* StringFromEnv(const char* flag, const char* default_value) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const value = GetEnv(env_var.c_str());
+  const char* const value = getenv(env_var.c_str());
   return value == nullptr ? default_value : value;
 }
 
@@ -175,7 +166,7 @@
   if (value_str == nullptr) return false;
 
   // Converts the string value to a bool.
-  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  *value = IsTruthyFlagValue(value_str);
   return true;
 }
 
@@ -217,4 +208,11 @@
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
+
+bool IsTruthyFlagValue(const std::string& str) {
+  if (str.empty()) return true;
+  char ch = str[0];
+  return isalnum(ch) &&
+         !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N');
+}
 }  // end namespace benchmark
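
The truthiness rule is easiest to read off by example:

IsTruthyFlagValue("");      // true  (bare "--flag" form)
IsTruthyFlagValue("true");  // true  ('t' is alphanumeric and not in the falsy set)
IsTruthyFlagValue("1");     // true
IsTruthyFlagValue("0");     // false
IsTruthyFlagValue("no");    // false ('n' is in the falsy set)
IsTruthyFlagValue("-1");    // false ('-' is not alphanumeric)
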
diff --git a/src/commandlineflags.h b/src/commandlineflags.h
index 34b9c6f..945c9a9 100644
--- a/src/commandlineflags.h
+++ b/src/commandlineflags.h
@@ -38,8 +38,7 @@
 // Parses a string for a bool flag, in the form of either
 // "--flag=value" or "--flag".
 //
-// In the former case, the value is taken as true as long as it does
-// not start with '0', 'f', or 'F'.
+// In the former case, the value is taken as true if it passes IsTruthyFlagValue().
 //
 // In the latter case, the value is taken as true.
 //
@@ -71,6 +70,10 @@
 // Returns true if the string matches the flag.
 bool IsFlag(const char* str, const char* flag);
 
+// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
+// some non-alphanumeric character. As a special case, also returns true if
+// value is the empty string.
+bool IsTruthyFlagValue(const std::string& value);
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/src/complexity.cc b/src/complexity.cc
index b42bd38..dfab791 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -31,9 +31,9 @@
     case oN:
       return [](int n) -> double { return n; };
     case oNSquared:
-      return [](int n) -> double { return n * n; };
+      return [](int n) -> double { return std::pow(n, 2); };
     case oNCubed:
-      return [](int n) -> double { return n * n * n; };
+      return [](int n) -> double { return std::pow(n, 3); };
     case oLogN:
       return [](int n) { return std::log2(n); };
     case oNLogN:
@@ -119,8 +119,7 @@
 //                  this one. If it is oAuto, the best-fitting curve will be
 //                  calculated.
 LeastSq MinimalLeastSq(const std::vector<int>& n,
-                       const std::vector<double>& time,
-                       const BigO complexity) {
+                       const std::vector<double>& time, const BigO complexity) {
   CHECK_EQ(n.size(), time.size());
   CHECK_GE(n.size(), 2);  // Do not compute fitting curve if less than two
                           // benchmark runs are given
@@ -196,6 +195,7 @@
       cpu_accumulated_time_stat.Mean() * run_iterations;
   mean_data.bytes_per_second = bytes_per_second_stat.Mean();
   mean_data.items_per_second = items_per_second_stat.Mean();
+  mean_data.time_unit = reports[0].time_unit;
 
   // Only add label to mean/stddev if it is same for all runs
   mean_data.report_label = reports[0].report_label;
@@ -214,6 +214,7 @@
   stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
   stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
   stddev_data.items_per_second = items_per_second_stat.StdDev();
+  stddev_data.time_unit = reports[0].time_unit;
 
   results.push_back(mean_data);
   results.push_back(stddev_data);
diff --git a/src/complexity.h b/src/complexity.h
index 85cc125..23cd9bb 100644
--- a/src/complexity.h
+++ b/src/complexity.h
@@ -47,10 +47,7 @@
 //                   parameter will return the best fitting curve detected.
 
 struct LeastSq {
-  LeastSq() :
-    coef(0.0),
-    rms(0.0),
-    complexity(oNone) {}
+  LeastSq() : coef(0.0), rms(0.0), complexity(oNone) {}
 
   double coef;
   double rms;
@@ -60,5 +57,5 @@
 // Function to return a string for the calculated complexity
 std::string GetBigOString(BigO complexity);
 
-} // end namespace benchmark
-#endif // COMPLEXITY_H_
+}  // end namespace benchmark
+#endif  // COMPLEXITY_H_
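
These fitting helpers back the user-facing Complexity() setters registered in benchmark_register.cc. A hedged usage sketch (BM_Search is hypothetical; the lambda form assumes <cmath>):

// oAuto lets MinimalLeastSq pick the curve with the lowest RMS.
BENCHMARK(BM_Search)->RangeMultiplier(2)->Range(1 << 10, 1 << 18)
    ->Complexity(benchmark::oAuto);
// A capture-less lambda supplies a custom curve as a BigOFunc*.
BENCHMARK(BM_Search)->Range(1 << 10, 1 << 18)
    ->Complexity([](int n) -> double { return n * std::log2(n); });
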
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 080c324..7e0cca3 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -28,9 +28,7 @@
 #include "commandlineflags.h"
 #include "internal_macros.h"
 #include "string_util.h"
-#include "walltime.h"
-
-DECLARE_bool(color_print);
+#include "timers.h"
 
 namespace benchmark {
 
@@ -40,37 +38,47 @@
   PrintBasicContext(&GetErrorStream(), context);
 
 #ifdef BENCHMARK_OS_WINDOWS
-  if (FLAGS_color_print && &std::cout != &GetOutputStream()) {
-      GetErrorStream() << "Color printing is only supported for stdout on windows."
-                          " Disabling color printing\n";
-      FLAGS_color_print = false;
+  if (color_output_ && &std::cout != &GetOutputStream()) {
+    GetErrorStream()
+        << "Color printing is only supported for stdout on windows."
+           " Disabling color printing\n";
+    color_output_ = false;
   }
 #endif
-  std::string str = FormatString("%-*s %13s %13s %10s\n",
-                             static_cast<int>(name_field_width_), "Benchmark",
-                             "Time", "CPU", "Iterations");
+  std::string str =
+      FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
+                   "Benchmark", "Time", "CPU", "Iterations");
   GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
 
   return true;
 }
 
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
-  for (const auto& run : reports)
-    PrintRunData(run);
+  for (const auto& run : reports) PrintRunData(run);
+}
+
+static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
+                             ...) {
+  va_list args;
+  va_start(args, fmt);
+  out << FormatString(fmt, args);
+  va_end(args);
 }
 
 void ConsoleReporter::PrintRunData(const Run& result) {
+  typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-
+  PrinterFn* printer =
+      color_output_ ? (PrinterFn*)ColorPrintf : IgnoreColorPrint;
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
-  ColorPrintf(Out, name_color, "%-*s ", name_field_width_,
-              result.benchmark_name.c_str());
+  printer(Out, name_color, "%-*s ", name_field_width_,
+          result.benchmark_name.c_str());
 
   if (result.error_occurred) {
-    ColorPrintf(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
-                result.error_message.c_str());
-    ColorPrintf(Out, COLOR_DEFAULT, "\n");
+    printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
+            result.error_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
   // Format bytes per second
@@ -82,43 +90,43 @@
   // Format items per second
   std::string items;
   if (result.items_per_second > 0) {
-    items = StrCat(" ", HumanReadableNumber(result.items_per_second),
-                   " items/s");
- }
+    items =
+        StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s");
+  }
 
   const double real_time = result.GetAdjustedRealTime();
   const double cpu_time = result.GetAdjustedCPUTime();
 
   if (result.report_big_o) {
     std::string big_o = GetBigOString(result.complexity);
-    ColorPrintf(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time,
-                big_o.c_str(), cpu_time, big_o.c_str());
+    printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(),
+            cpu_time, big_o.c_str());
   } else if (result.report_rms) {
-    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
-                cpu_time * 100);
+    printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
+            cpu_time * 100);
   } else {
     const char* timeLabel = GetTimeUnitString(result.time_unit);
-    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
-                cpu_time, timeLabel);
+    printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
+            cpu_time, timeLabel);
   }
 
   if (!result.report_big_o && !result.report_rms) {
-    ColorPrintf(Out, COLOR_CYAN, "%10lld", result.iterations);
+    printer(Out, COLOR_CYAN, "%10lld", result.iterations);
   }
 
   if (!rate.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
+    printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
   }
 
   if (!items.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
+    printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
   }
 
   if (!result.report_label.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
+    printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
   }
 
-  ColorPrintf(Out, COLOR_DEFAULT, "\n");
+  printer(Out, COLOR_DEFAULT, "\n");
 }
 
 }  // end namespace benchmark
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index 7bc7ef3..18ab3b6 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -23,7 +23,7 @@
 #include <vector>
 
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 
 // File format reference: http://edoceo.com/utilitas/csv-file-format.
 
@@ -31,38 +31,28 @@
 
 namespace {
 std::vector<std::string> elements = {
-  "name",
-  "iterations",
-  "real_time",
-  "cpu_time",
-  "time_unit",
-  "bytes_per_second",
-  "items_per_second",
-  "label",
-  "error_occurred",
-  "error_message"
-};
+    "name",           "iterations",       "real_time",        "cpu_time",
+    "time_unit",      "bytes_per_second", "items_per_second", "label",
+    "error_occurred", "error_message"};
 }
 
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
 
   std::ostream& Out = GetOutputStream();
-  for (auto B = elements.begin(); B != elements.end(); ) {
+  for (auto B = elements.begin(); B != elements.end();) {
     Out << *B++;
-    if (B != elements.end())
-      Out << ",";
+    if (B != elements.end()) Out << ",";
   }
   Out << "\n";
   return true;
 }
 
-void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
-  for (const auto& run : reports)
-    PrintRunData(run);
+void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
+  for (const auto& run : reports) PrintRunData(run);
 }
 
-void CSVReporter::PrintRunData(const Run & run) {
+void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
 
   // Field with embedded double-quote characters must be doubled and the field
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 1080ac9..e8efcbb 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -4,37 +4,39 @@
 #include "benchmark/macros.h"
 
 #ifndef __has_feature
-# define __has_feature(x) 0
-#endif
-
-#if __has_feature(cxx_attributes)
-# define BENCHMARK_NORETURN [[noreturn]]
-#elif defined(__GNUC__)
-# define BENCHMARK_NORETURN __attribute__((noreturn))
-#else
-# define BENCHMARK_NORETURN
-#endif
-
-#if defined(__CYGWIN__)
-# define BENCHMARK_OS_CYGWIN 1
-#elif defined(_WIN32)
-# define BENCHMARK_OS_WINDOWS 1
-#elif defined(__APPLE__)
-// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
-// that it is an apple system.
-# define BENCHMARK_OS_MACOSX 1
-#elif defined(__FreeBSD__)
-# define BENCHMARK_OS_FREEBSD 1
-#elif defined(__linux__)
-# define BENCHMARK_OS_LINUX 1
+#define __has_feature(x) 0
 #endif
 
 #if defined(__clang__)
-# define COMPILER_CLANG
+#define COMPILER_CLANG
 #elif defined(_MSC_VER)
-# define COMPILER_MSVC
+#define COMPILER_MSVC
 #elif defined(__GNUC__)
-# define COMPILER_GCC
+#define COMPILER_GCC
 #endif
 
-#endif // BENCHMARK_INTERNAL_MACROS_H_
+#if __has_feature(cxx_attributes)
+#define BENCHMARK_NORETURN [[noreturn]]
+#elif defined(__GNUC__)
+#define BENCHMARK_NORETURN __attribute__((noreturn))
+#elif defined(COMPILER_MSVC)
+#define BENCHMARK_NORETURN __declspec(noreturn)
+#else
+#define BENCHMARK_NORETURN
+#endif
+
+#if defined(__CYGWIN__)
+#define BENCHMARK_OS_CYGWIN 1
+#elif defined(_WIN32)
+#define BENCHMARK_OS_WINDOWS 1
+#elif defined(__APPLE__)
+// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
+// that it is an apple system.
+#define BENCHMARK_OS_MACOSX 1
+#elif defined(__FreeBSD__)
+#define BENCHMARK_OS_FREEBSD 1
+#elif defined(__linux__)
+#define BENCHMARK_OS_LINUX 1
+#endif
+
+#endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index 485d305..cea5f9b 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -23,7 +23,7 @@
 #include <vector>
 
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 
 namespace benchmark {
 
@@ -47,11 +47,13 @@
   return ss.str();
 }
 
-int64_t RoundDouble(double v) {
-    return static_cast<int64_t>(v + 0.5);
+std::string FormatKV(std::string const& key, double value) {
+  return StringPrintF("\"%s\": %.2f", key.c_str(), value);
 }
 
-} // end namespace
+int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
+
+}  // end namespace
 
 bool JSONReporter::ReportContext(const Context& context) {
   std::ostream& out = GetOutputStream();
@@ -66,14 +68,11 @@
   std::string walltime_value = LocalDateTimeString();
   out << indent << FormatKV("date", walltime_value) << ",\n";
 
-  out << indent
-      << FormatKV("num_cpus", static_cast<int64_t>(context.num_cpus))
+  out << indent << FormatKV("num_cpus", static_cast<int64_t>(context.num_cpus))
       << ",\n";
-  out << indent
-      << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu))
+  out << indent << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu))
       << ",\n";
-  out << indent
-      << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
+  out << indent << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
       << ",\n";
 
 #if defined(NDEBUG)
@@ -118,28 +117,20 @@
 void JSONReporter::PrintRunData(Run const& run) {
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
-    out << indent
-        << FormatKV("name", run.benchmark_name)
-        << ",\n";
-    if (run.error_occurred) {
-        out << indent
-            << FormatKV("error_occurred", run.error_occurred)
-            << ",\n";
-        out << indent
-            << FormatKV("error_message", run.error_message)
-            << ",\n";
-    }
+  out << indent << FormatKV("name", run.benchmark_name) << ",\n";
+  if (run.error_occurred) {
+    out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
+    out << indent << FormatKV("error_message", run.error_message) << ",\n";
+  }
   if (!run.report_big_o && !run.report_rms) {
-        out << indent
-            << FormatKV("iterations", run.iterations)
-            << ",\n";
-        out << indent
-            << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
-            << ",\n";
-        out << indent
-            << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
-        out << ",\n" << indent
-            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+    out << indent << FormatKV("iterations", run.iterations) << ",\n";
+    out << indent
+        << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
+        << ",\n";
+    out << indent
+        << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
+    out << ",\n"
+        << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
     out << indent
         << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime()))
@@ -147,15 +138,11 @@
     out << indent
         << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime()))
         << ",\n";
+    out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n";
+    out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+  } else if (run.report_rms) {
     out << indent
-            << FormatKV("big_o", GetBigOString(run.complexity))
-            << ",\n";
-        out << indent
-            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
-    } else if(run.report_rms) {
-        out << indent
-            << FormatKV("rms", RoundDouble(run.GetAdjustedCPUTime()*100))
-            << '%';
+        << FormatKV("rms", run.GetAdjustedCPUTime());
   }
   if (run.bytes_per_second > 0.0) {
     out << ",\n"
@@ -168,9 +155,7 @@
         << FormatKV("items_per_second", RoundDouble(run.items_per_second));
   }
   if (!run.report_label.empty()) {
-    out << ",\n"
-        << indent
-        << FormatKV("label", run.report_label);
+    out << ",\n" << indent << FormatKV("label", run.report_label);
   }
   out << '\n';
 }
diff --git a/src/log.cc b/src/log.cc
deleted file mode 100644
index b660309..0000000
--- a/src/log.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "log.h"
-
-#include <iostream>
-
-namespace benchmark {
-namespace internal {
-
-int& LoggingLevelImp() {
-    static int level = 0;
-    return level;
-}
-
-void SetLogLevel(int value) {
-    LoggingLevelImp() = value;
-}
-
-int GetLogLevel() {
-    return LoggingLevelImp();
-}
-
-class NullLogBuffer : public std::streambuf
-{
-public:
-  int overflow(int c) {
-    return c;
-  }
-};
-
-std::ostream& GetNullLogInstance() {
-  static NullLogBuffer log_buff;
-  static std::ostream null_log(&log_buff);
-  return null_log;
-}
-
-std::ostream& GetErrorLogInstance() {
-  return std::clog;
-}
-
-} // end namespace internal
-} // end namespace benchmark
\ No newline at end of file
diff --git a/src/log.h b/src/log.h
index 3777810..978cb0b 100644
--- a/src/log.h
+++ b/src/log.h
@@ -1,28 +1,73 @@
 #ifndef BENCHMARK_LOG_H_
 #define BENCHMARK_LOG_H_
 
+#include <iostream>
 #include <ostream>
 
+#include "benchmark/macros.h"
+
 namespace benchmark {
 namespace internal {
 
-int GetLogLevel();
-void SetLogLevel(int level);
+typedef std::basic_ostream<char>&(EndLType)(std::basic_ostream<char>&);
 
-std::ostream& GetNullLogInstance();
-std::ostream& GetErrorLogInstance();
+class LogType {
+  friend LogType& GetNullLogInstance();
+  friend LogType& GetErrorLogInstance();
 
-inline std::ostream& GetLogInstanceForLevel(int level) {
-  if (level <= GetLogLevel()) {
+  // FIXME: Add locking to output.
+  template <class Tp>
+  friend LogType& operator<<(LogType&, Tp const&);
+  friend LogType& operator<<(LogType&, EndLType*);
+
+ private:
+  LogType(std::ostream* out) : out_(out) {}
+  std::ostream* out_;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+};
+
+template <class Tp>
+LogType& operator<<(LogType& log, Tp const& value) {
+  if (log.out_) {
+    *log.out_ << value;
+  }
+  return log;
+}
+
+inline LogType& operator<<(LogType& log, EndLType* m) {
+  if (log.out_) {
+    *log.out_ << m;
+  }
+  return log;
+}
+
+inline int& LogLevel() {
+  static int log_level = 0;
+  return log_level;
+}
+
+inline LogType& GetNullLogInstance() {
+  static LogType log(nullptr);
+  return log;
+}
+
+inline LogType& GetErrorLogInstance() {
+  static LogType log(&std::clog);
+  return log;
+}
+
+inline LogType& GetLogInstanceForLevel(int level) {
+  if (level <= LogLevel()) {
     return GetErrorLogInstance();
   }
   return GetNullLogInstance();
 }
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
 
-#define VLOG(x) (::benchmark::internal::GetLogInstanceForLevel(x) \
-                 << "-- LOG(" << x << "): ")
+#define VLOG(x) \
+  (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "): ")
 
 #endif
\ No newline at end of file
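
A call-site sketch of the macro and its gating, for the library's internal sources:

// Emitted only when --v=2 (or higher) was parsed into FLAGS_v and copied into
// internal::LogLevel() by Initialize(); otherwise it drains into the null LogType.
VLOG(2) << "starting warmup pass" << std::endl;
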
diff --git a/src/mutex.h b/src/mutex.h
index f37ec35..5f461d0 100644
--- a/src/mutex.h
+++ b/src/mutex.h
@@ -1,28 +1,26 @@
 #ifndef BENCHMARK_MUTEX_H_
 #define BENCHMARK_MUTEX_H_
 
-#include <mutex>
 #include <condition_variable>
+#include <mutex>
+
+#include "check.h"
 
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
 #endif
 
-#define CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
 
-#define SCOPED_CAPABILITY \
-  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
 
-#define GUARDED_BY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
 
-#define PT_GUARDED_BY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
 
 #define ACQUIRED_BEFORE(...) \
   THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
@@ -54,22 +52,18 @@
 #define TRY_ACQUIRE_SHARED(...) \
   THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
 
-#define EXCLUDES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
 
-#define ASSERT_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
 
 #define ASSERT_SHARED_CAPABILITY(x) \
   THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
 
-#define RETURN_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
 
 #define NO_THREAD_SAFETY_ANALYSIS \
   THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
 
-
 namespace benchmark {
 
 typedef std::condition_variable Condition;
@@ -78,65 +72,84 @@
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
 // used directly because they do not provide the required annotations.
-class CAPABILITY("mutex") Mutex
-{
-public:
+class CAPABILITY("mutex") Mutex {
+ public:
   Mutex() {}
 
   void lock() ACQUIRE() { mut_.lock(); }
   void unlock() RELEASE() { mut_.unlock(); }
-  std::mutex& native_handle() {
-    return mut_;
-  }
-private:
+  std::mutex& native_handle() { return mut_; }
+
+ private:
   std::mutex mut_;
 };
 
-
-class SCOPED_CAPABILITY MutexLock
-{
+class SCOPED_CAPABILITY MutexLock {
   typedef std::unique_lock<std::mutex> MutexLockImp;
-public:
-  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle())
-  { }
+
+ public:
+  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {}
   ~MutexLock() RELEASE() {}
   MutexLockImp& native_handle() { return ml_; }
-private:
+
+ private:
   MutexLockImp ml_;
 };
 
+class Barrier {
+ public:
+  Barrier(int num_threads) : running_threads_(num_threads) {}
 
-class Notification
-{
-public:
-  Notification() : notified_yet_(false) { }
-
-  void WaitForNotification() const EXCLUDES(mutex_) {
-    MutexLock m_lock(mutex_);
-    auto notified_fn = [this]() REQUIRES(mutex_) {
-                            return this->HasBeenNotified();
-                        };
-    cv_.wait(m_lock.native_handle(), notified_fn);
-  }
-
-  void Notify() EXCLUDES(mutex_) {
+  // Called by each thread
+  bool wait() EXCLUDES(lock_) {
+    bool last_thread = false;
     {
-      MutexLock lock(mutex_);
-      notified_yet_ = 1;
+      MutexLock ml(lock_);
+      last_thread = createBarrier(ml);
     }
-    cv_.notify_all();
+    if (last_thread) phase_condition_.notify_all();
+    return last_thread;
   }
 
-private:
-  bool HasBeenNotified() const REQUIRES(mutex_) {
-    return notified_yet_;
+  void removeThread() EXCLUDES(lock_) {
+    MutexLock ml(lock_);
+    --running_threads_;
+    if (entered_ != 0) phase_condition_.notify_all();
   }
 
-  mutable Mutex mutex_;
-  mutable std::condition_variable cv_;
-  bool notified_yet_ GUARDED_BY(mutex_);
+ private:
+  Mutex lock_;
+  Condition phase_condition_;
+  int running_threads_;
+
+  // State for barrier management
+  int phase_number_ = 0;
+  int entered_ = 0;  // Number of threads that have entered this barrier
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns true iff this is the last thread to
+  // enter the barrier.
+  bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, running_threads_);
+    entered_++;
+    if (entered_ < running_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_;  // A thread has aborted in error
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      if (phase_number_ > phase_number_cp) return false;
+      // else (running_threads_ == entered_) and we are the last thread.
+    }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
+  }
 };
 
-} // end namespace benchmark
+}  // end namespace benchmark
 
-#endif // BENCHMARK_MUTEX_H_
+#endif  // BENCHMARK_MUTEX_H_
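
The Barrier replaces the old Notification for coordinating benchmark worker threads: every thread calls wait(), and exactly one (the last to arrive in each phase) sees true. A hedged sketch of the protocol (the helper names are hypothetical):

benchmark::Barrier barrier(/*num_threads=*/4);

void Worker() {
  SetUpThreadState();       // hypothetical per-thread setup
  if (barrier.wait()) {
    ResetSharedCounters();  // hypothetical: one thread does the shared work
  }
  RunMeasuredSection();     // hypothetical: all threads proceed together
}
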
diff --git a/src/re.h b/src/re.h
index af57a39..af4a498 100644
--- a/src/re.h
+++ b/src/re.h
@@ -26,13 +26,16 @@
 #endif
 #include <string>
 
+#include "check.h"
+
 namespace benchmark {
 
 // A wrapper around the POSIX regular expression API that provides automatic
 // cleanup
 class Regex {
  public:
-  Regex();
+  Regex() : init_(false) {}
+
   ~Regex();
 
   // Compile a regular expression matcher from spec.  Returns true on success.
@@ -43,18 +46,81 @@
 
   // Returns whether str matches the compiled regular expression.
   bool Match(const std::string& str);
+
  private:
   bool init_;
-  // Underlying regular expression object
+// Underlying regular expression object
 #if defined(HAVE_STD_REGEX)
   std::regex re_;
 #elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX)
   regex_t re_;
 #else
-# error No regular expression backend implementation available
+#error No regular expression backend implementation available
 #endif
 };
 
+#if defined(HAVE_STD_REGEX)
+
+inline bool Regex::Init(const std::string& spec, std::string* error) {
+  try {
+    re_ = std::regex(spec, std::regex_constants::extended);
+
+    init_ = true;
+  } catch (const std::regex_error& e) {
+    if (error) {
+      *error = e.what();
+    }
+  }
+  return init_;
+}
+
+inline Regex::~Regex() {}
+
+inline bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+  return std::regex_search(str, re_);
+}
+
+#else
+inline bool Regex::Init(const std::string& spec, std::string* error) {
+  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    if (error) {
+      size_t needed = regerror(ec, &re_, nullptr, 0);
+      char* errbuf = new char[needed];
+      regerror(ec, &re_, errbuf, needed);
+
+      // regerror returns the number of bytes necessary to null terminate
+      // the string, so we exclude the terminator when assigning to error.
+      CHECK_NE(needed, 0);
+      error->assign(errbuf, needed - 1);
+
+      delete[] errbuf;
+    }
+
+    return false;
+  }
+
+  init_ = true;
+  return true;
+}
+
+inline Regex::~Regex() {
+  if (init_) {
+    regfree(&re_);
+  }
+}
+
+inline bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+  return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
+}
+#endif
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_RE_H_
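
With both backends now inline in the header, the wrapper is used as before; a short sketch:

benchmark::Regex re;
std::string err;
if (!re.Init("BM_.*", &err)) {
  std::cerr << "could not compile spec: " << err << "\n";
} else if (re.Match("BM_memcpy/8")) {
  // The filter matches this instance name.
}
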
diff --git a/src/re_posix.cc b/src/re_posix.cc
deleted file mode 100644
index 95b086f..0000000
--- a/src/re_posix.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "check.h"
-#include "re.h"
-
-namespace benchmark {
-
-Regex::Regex() : init_(false) { }
-
-bool Regex::Init(const std::string& spec, std::string* error) {
-  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    if (error) {
-      size_t needed = regerror(ec, &re_, nullptr, 0);
-      char* errbuf = new char[needed];
-      regerror(ec, &re_, errbuf, needed);
-
-      // regerror returns the number of bytes necessary to null terminate
-      // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
-      error->assign(errbuf, needed - 1);
-
-      delete[] errbuf;
-    }
-
-    return false;
-  }
-
-  init_ = true;
-  return true;
-}
-
-Regex::~Regex() {
-  if (init_) {
-    regfree(&re_);
-  }
-}
-
-bool Regex::Match(const std::string& str) {
-  if (!init_) {
-    return false;
-  }
-
-  return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
-}
-
-}  // end namespace benchmark
diff --git a/src/re_std.cc b/src/re_std.cc
deleted file mode 100644
index cfd7a21..0000000
--- a/src/re_std.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "re.h"
-
-namespace benchmark {
-
-Regex::Regex() : init_(false) { }
-
-bool Regex::Init(const std::string& spec, std::string* error) {
-  try {
-    re_ = std::regex(spec, std::regex_constants::extended);
-
-    init_ = true;
-  } catch (const std::regex_error& e) {
-    if (error) {
-      *error = e.what();
-    }
-  }
-  return init_;
-}
-
-Regex::~Regex() { }
-
-bool Regex::Match(const std::string& str) {
-  if (!init_) {
-    return false;
-  }
-
-  return std::regex_search(str, re_);
-}
-
-}  // end namespace benchmark
diff --git a/src/reporter.cc b/src/reporter.cc
index 5187859..6474242 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -13,13 +13,13 @@
 // limitations under the License.
 
 #include "benchmark/reporter.h"
-#include "walltime.h"
+#include "timers.h"
 
 #include <cstdlib>
 
 #include <iostream>
-#include <vector>
 #include <tuple>
+#include <vector>
 
 #include "check.h"
 #include "stat.h"
@@ -27,49 +27,42 @@
 namespace benchmark {
 
 BenchmarkReporter::BenchmarkReporter()
-    : output_stream_(&std::cout), error_stream_(&std::cerr)
-{
-}
+    : output_stream_(&std::cout), error_stream_(&std::cerr) {}
 
-BenchmarkReporter::~BenchmarkReporter() {
-}
+BenchmarkReporter::~BenchmarkReporter() {}
 
 void BenchmarkReporter::PrintBasicContext(std::ostream *out_ptr,
                                           Context const &context) {
   CHECK(out_ptr) << "cannot be null";
-  auto& Out = *out_ptr;
+  auto &Out = *out_ptr;
 
   Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
-            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+      << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
 
   Out << LocalDateTimeString() << "\n";
 
   if (context.cpu_scaling_enabled) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
-                 "real time measurements may be noisy and will incur extra "
-                 "overhead.\n";
+           "real time measurements may be noisy and will incur extra "
+           "overhead.\n";
   }
 
 #ifndef NDEBUG
   Out << "***WARNING*** Library was built as DEBUG. Timings may be "
-               "affected.\n";
+         "affected.\n";
 #endif
 }
 
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0)
-    new_time /= static_cast<double>(iterations);
+  if (iterations != 0) new_time /= static_cast<double>(iterations);
   return new_time;
 }
 
 double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
   double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0)
-    new_time /= static_cast<double>(iterations);
+  if (iterations != 0) new_time /= static_cast<double>(iterations);
   return new_time;
 }
 
-
-
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/stat.h b/src/stat.h
index c4ecfe8..136c3aa 100644
--- a/src/stat.h
+++ b/src/stat.h
@@ -6,7 +6,6 @@
 #include <ostream>
 #include <type_traits>
 
-
 namespace benchmark {
 
 template <typename VType, typename NumType>
@@ -136,7 +135,7 @@
 
  private:
   static_assert(std::is_integral<NumType>::value &&
-                !std::is_same<NumType, bool>::value,
+                    !std::is_same<NumType, bool>::value,
                 "NumType must be an integral type that is not bool.");
   // Let i be the index of the samples provided (using +=)
   // and weight[i],value[i] be the data of sample #i
diff --git a/src/string_util.cc b/src/string_util.cc
index 30d1305..4cefbfb 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -1,11 +1,11 @@
 #include "string_util.h"
 
+#include <array>
 #include <cmath>
 #include <cstdarg>
-#include <array>
+#include <cstdio>
 #include <memory>
 #include <sstream>
-#include <stdio.h>
 
 #include "arraysize.h"
 
@@ -27,7 +27,7 @@
 
 static const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
-} // end anonymous namespace
+}  // end anonymous namespace
 
 void ToExponentAndMantissa(double val, double thresh, int precision,
                            double one_k, std::string* mantissa,
@@ -107,7 +107,7 @@
 void AppendHumanReadable(int n, std::string* str) {
   std::stringstream ss;
   // Round down to the nearest SI prefix.
-  ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
+  ss << ToBinaryStringFullySpecified(n, 1.0, 0);
   *str += ss.str();
 }
 
@@ -118,8 +118,7 @@
   return ToBinaryStringFullySpecified(n, 1.1, 1);
 }
 
-std::string StringPrintFImp(const char *msg, va_list args)
-{
+std::string StringPrintFImp(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptively make a copy
   va_list args_cp;
   va_copy(args_cp, args);
@@ -128,14 +127,14 @@
   // allocation guess what the size might be
   std::array<char, 256> local_buff;
   std::size_t size = local_buff.size();
-  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+  // in the android-ndk
   auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
 
   va_end(args_cp);
 
   // handle empty expansion
-  if (ret == 0)
-    return std::string{};
+  if (ret == 0) return std::string{};
   if (static_cast<std::size_t>(ret) < size)
     return std::string(local_buff.data());
 
@@ -143,13 +142,13 @@
   // add 1 to size to account for null-byte in size cast to prevent overflow
   size = static_cast<std::size_t>(ret) + 1;
   auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
-  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+  // in the android-ndk
   ret = vsnprintf(buff_ptr.get(), size, msg, args);
   return std::string(buff_ptr.get());
 }
 
-std::string StringPrintF(const char* format, ...)
-{
+std::string StringPrintF(const char* format, ...) {
   va_list args;
   va_start(args, format);
   std::string tmp = StringPrintFImp(format, args);
@@ -160,10 +159,10 @@
 void ReplaceAll(std::string* str, const std::string& from,
                 const std::string& to) {
   std::size_t start = 0;
-  while((start = str->find(from, start)) != std::string::npos) {
+  while ((start = str->find(from, start)) != std::string::npos) {
     str->replace(start, from.length(), to);
     start += to.length();
   }
 }
 
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/string_util.h b/src/string_util.h
index b89fef5..0b190b9 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -1,8 +1,8 @@
 #ifndef BENCHMARK_STRING_UTIL_H_
 #define BENCHMARK_STRING_UTIL_H_
 
-#include <string>
 #include <sstream>
+#include <string>
 #include <utility>
 #include "internal_macros.h"
 
@@ -14,23 +14,19 @@
 
 std::string StringPrintF(const char* format, ...);
 
-inline std::ostream&
-StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT
-{
+inline std::ostream& StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
   return out;
 }
 
-template <class First, class ...Rest>
-inline std::ostream&
-StringCatImp(std::ostream& out, First&& f, Rest&&... rest)
-{
+template <class First, class... Rest>
+inline std::ostream& StringCatImp(std::ostream& out, First&& f,
+                                  Rest&&... rest) {
   out << std::forward<First>(f);
   return StringCatImp(out, std::forward<Rest>(rest)...);
 }
 
-template<class ...Args>
-inline std::string StrCat(Args&&... args)
-{
+template <class... Args>
+inline std::string StrCat(Args&&... args) {
   std::ostringstream ss;
   StringCatImp(ss, std::forward<Args>(args)...);
   return ss.str();
@@ -39,6 +35,6 @@
 void ReplaceAll(std::string* str, const std::string& from,
                 const std::string& to);
 
-} // end namespace benchmark
+}  // end namespace benchmark
 
-#endif // BENCHMARK_STRING_UTIL_H_
+#endif  // BENCHMARK_STRING_UTIL_H_
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 3a5d942..dd1e663 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -17,13 +17,13 @@
 
 #ifdef BENCHMARK_OS_WINDOWS
 #include <Shlwapi.h>
-#include <Windows.h>
 #include <VersionHelpers.h>
+#include <Windows.h>
 #else
 #include <fcntl.h>
 #include <sys/resource.h>
-#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
 #if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
 #include <sys/sysctl.h>
@@ -31,8 +31,8 @@
 #endif
 
 #include <cerrno>
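+#include <cctype>  // assumed: needed for std::tolower() used below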
-#include <cstdio>
 #include <cstdint>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
@@ -52,7 +52,6 @@
 std::once_flag cpuinfo_init;
 double cpuinfo_cycles_per_second = 1.0;
 int cpuinfo_num_cpus = 1;  // Conservative guess
-std::mutex cputimens_mutex;
 
 #if !defined BENCHMARK_OS_MACOSX
 const int64_t estimate_time_ms = 1000;
@@ -88,6 +87,22 @@
 }
 #endif
 
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+static std::string convertToLowerCase(std::string s) {
+  for (auto& ch : s)
+    ch = std::tolower(ch);
+  return s;
+}
+static bool startsWithKey(std::string Value, std::string Key,
+                          bool IgnoreCase = true) {
+  if (IgnoreCase) {
+    Key = convertToLowerCase(std::move(Key));
+    Value = convertToLowerCase(std::move(Value));
+  }
+  return Value.compare(0, Key.size(), Key) == 0;
+}
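+// For example, startsWithKey("cpu MHz : 2793.301", "cpu mhz") is true, while
+// startsWithKey("Processor : ARMv7", "processor", false) is false because
+// that comparison is case-sensitive.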
+#endif
+
 void InitializeSystemInfo() {
 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
   char line[1024];
@@ -127,7 +142,8 @@
   if (fd == -1) {
     perror(pname);
     if (!saw_mhz) {
-      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+      cpuinfo_cycles_per_second =
+          static_cast<double>(EstimateCyclesPerSecond());
     }
     return;
   }
@@ -160,21 +176,21 @@
     // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
     // accept positive values. Some environments (virtual machines) report zero,
     // which would cause infinite looping in WallTime_Init.
-    if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz") - 1) == 0) {
+    if (!saw_mhz && startsWithKey(line, "cpu MHz")) {
       const char* freqstr = strchr(line, ':');
       if (freqstr) {
         cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0;
         if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
           saw_mhz = true;
       }
-    } else if (strncasecmp(line, "bogomips", sizeof("bogomips") - 1) == 0) {
+    } else if (startsWithKey(line, "bogomips")) {
       const char* freqstr = strchr(line, ':');
       if (freqstr) {
         bogo_clock = strtod(freqstr + 1, &err) * 1000000.0;
         if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
           saw_bogo = true;
       }
-    } else if (strncmp(line, "processor", sizeof("processor") - 1) == 0) {
+    } else if (startsWithKey(line, "processor", /*IgnoreCase*/ false)) {
       // The above comparison is case-sensitive because ARM kernels often
       // include a "Processor" line that tells you about the CPU, distinct
       // from the usual "processor" lines that give you CPU ids. No current
@@ -197,7 +213,8 @@
       cpuinfo_cycles_per_second = bogo_clock;
     } else {
       // If we don't even have bogomips, we'll use the slow estimation.
-      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+      cpuinfo_cycles_per_second =
+          static_cast<double>(EstimateCyclesPerSecond());
     }
   }
   if (num_cpus == 0) {
@@ -239,7 +256,6 @@
   }
 // TODO: also figure out cpuinfo_num_cpus
 
-
 #elif defined BENCHMARK_OS_WINDOWS
   // In NT, read MHz from the registry. If we fail to do so or we're in win9x
   // then make a crude estimate.
@@ -249,139 +265,47 @@
           SHGetValueA(HKEY_LOCAL_MACHINE,
                       "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                       "~MHz", nullptr, &data, &data_size)))
-    cpuinfo_cycles_per_second = static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
+    cpuinfo_cycles_per_second =
+        static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
   else
     cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
 
-  SYSTEM_INFO sysinfo = { 0 };
+  SYSTEM_INFO sysinfo;
+  // Use memset as opposed to = {} to avoid GCC missing initializer false
+  // positives.
+  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
   GetSystemInfo(&sysinfo);
-  cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors; // number of logical processors in the current group
+  cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors;  // number of logical
+                                                    // processors in the current
+                                                    // group
 
 #elif defined BENCHMARK_OS_MACOSX
-  // returning "mach time units" per second. the current number of elapsed
-  // mach time units can be found by calling uint64 mach_absolute_time();
-  // while not as precise as actual CPU cycles, it is accurate in the face
-  // of CPU frequency scaling and multi-cpu/core machines.
-  // Our mac users have these types of machines, and accuracy
-  // (i.e. correctness) trumps precision.
-  // See cycleclock.h: CycleClock::Now(), which returns number of mach time
-  // units on Mac OS X.
-  mach_timebase_info_data_t timebase_info;
-  mach_timebase_info(&timebase_info);
-  double mach_time_units_per_nanosecond =
-      static_cast<double>(timebase_info.denom) /
-      static_cast<double>(timebase_info.numer);
-  cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
-
-  int num_cpus = 0;
+  int32_t num_cpus = 0;
   size_t size = sizeof(num_cpus);
-  int numcpus_name[] = {CTL_HW, HW_NCPU};
-  if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) ==
-          0 &&
-      (size == sizeof(num_cpus)))
+  if (::sysctlbyname("hw.ncpu", &num_cpus, &size, nullptr, 0) == 0 &&
+      (size == sizeof(num_cpus))) {
     cpuinfo_num_cpus = num_cpus;
-
+  } else {
+    fprintf(stderr, "%s\n", strerror(errno));
+    std::exit(EXIT_FAILURE);
+  }
+  int64_t cpu_freq = 0;
+  size = sizeof(cpu_freq);
+  if (::sysctlbyname("hw.cpufrequency", &cpu_freq, &size, nullptr, 0) == 0 &&
+      (size == sizeof(cpu_freq))) {
+    cpuinfo_cycles_per_second = cpu_freq;
+  } else {
+    fprintf(stderr, "%s\n", strerror(errno));
+    std::exit(EXIT_FAILURE);
+  }
 #else
   // Generic cycles per second counter
   cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
 #endif
 }
+
 }  // end namespace
 
-// getrusage() based implementation of MyCPUUsage
-static double MyCPUUsageRUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_SELF, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  HANDLE proc = GetCurrentProcess();
-  FILETIME creation_time;
-  FILETIME exit_time;
-  FILETIME kernel_time;
-  FILETIME user_time;
-  ULARGE_INTEGER kernel;
-  ULARGE_INTEGER user;
-  GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time);
-  kernel.HighPart = kernel_time.dwHighDateTime;
-  kernel.LowPart = kernel_time.dwLowDateTime;
-  user.HighPart = user_time.dwHighDateTime;
-  user.LowPart = user_time.dwLowDateTime;
-  return (static_cast<double>(kernel.QuadPart) +
-          static_cast<double>(user.QuadPart)) * 1e-7;
-#endif  // OS_WINDOWS
-}
-
-#ifndef BENCHMARK_OS_WINDOWS
-static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
-  static int cputime_fd = -1;
-  if (cputime_fd == -1) {
-    cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
-    if (cputime_fd < 0) {
-      cputime_fd = -1;
-      return false;
-    }
-  }
-  char buff[64];
-  memset(buff, 0, sizeof(buff));
-  if (pread(cputime_fd, buff, sizeof(buff) - 1, 0) <= 0) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  unsigned long long result = strtoull(buff, nullptr, 0);
-  if (result == (std::numeric_limits<unsigned long long>::max)()) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  *cputime = static_cast<double>(result) / 1e9;
-  return true;
-}
-#endif  // OS_WINDOWS
-
-double MyCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  {
-    std::lock_guard<std::mutex> l(cputimens_mutex);
-    static bool use_cputime_ns = true;
-    if (use_cputime_ns) {
-      double value;
-      if (MyCPUUsageCPUTimeNsLocked(&value)) {
-        return value;
-      }
-      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
-      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
-      use_cputime_ns = false;
-    }
-  }
-#endif  // OS_WINDOWS
-  return MyCPUUsageRUsage();
-}
-
-double ChildrenCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  // TODO: Not sure what this even means on Windows
-  return 0.0;
-#endif  // OS_WINDOWS
-}
-
 double CyclesPerSecond(void) {
   std::call_once(cpuinfo_init, InitializeSystemInfo);
   return cpuinfo_cycles_per_second;
@@ -408,8 +332,8 @@
   // local file system. If reading the exported files fails, then we may not be
   // running on Linux, so we silently ignore all the read errors.
   for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
-    std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
-                                       "/cpufreq/scaling_governor");
+    std::string governor_file =
+        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
     FILE* file = fopen(governor_file.c_str(), "r");
     if (!file) break;
     char buff[16];
diff --git a/src/sysinfo.h b/src/sysinfo.h
index eaf77e0..c5d9916 100644
--- a/src/sysinfo.h
+++ b/src/sysinfo.h
@@ -2,8 +2,6 @@
 #define BENCHMARK_SYSINFO_H_
 
 namespace benchmark {
-double MyCPUUsage();
-double ChildrenCPUUsage();
 int NumCPUs();
 double CyclesPerSecond();
 bool CpuScalingEnabled();
diff --git a/src/timers.cc b/src/timers.cc
new file mode 100644
index 0000000..fadc08f
--- /dev/null
+++ b/src/timers.cc
@@ -0,0 +1,195 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "timers.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Shlwapi.h>
+#include <VersionHelpers.h>
+#include <Windows.h>
+#else
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/thread_act.h>
+#endif
+#endif
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "check.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+
+// Suppress unused warnings on helper functions.
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace {
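+// Each MakeTime() overload below converts a platform-specific CPU-time
+// representation (FILETIME pair, rusage, thread_basic_info, or timespec)
+// into seconds expressed as a double.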
+#if defined(BENCHMARK_OS_WINDOWS)
+double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) *
+         1e-7;
+}
+#else
+double MakeTime(struct rusage const& ru) {
+  return (static_cast<double>(ru.ru_utime.tv_sec) +
+          static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+          static_cast<double>(ru.ru_stime.tv_sec) +
+          static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+}
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+double MakeTime(thread_basic_info_data_t const& info) {
+  return (static_cast<double>(info.user_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6 +
+          static_cast<double>(info.system_time.seconds) +
+          static_cast<double>(info.system_time.microseconds) * 1e-6);
+}
+#endif
+#if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID)
+double MakeTime(struct timespec const& ts) {
+  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+}
+#endif
+
+BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
+  std::cerr << "ERROR: " << msg << std::endl;
+  std::exit(EXIT_FAILURE);
+}
+
+}  // end namespace
+
+double ProcessCPUUsage() {
+// FIXME We want to use clock_gettime, but it's not available on MacOS 10.11.
+// See https://github.com/google/benchmark/pull/292
+#if defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#elif defined(BENCHMARK_OS_WINDOWS)
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time,
+                      &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetProccessTimes() failed");
+#else
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#endif
+}
+
+double ThreadCPUUsage() {
+// FIXME We want to use clock_gettime, but it's not available on MacOS 10.11.
+// See https://github.com/google/benchmark/pull/292
+#if defined(CLOCK_THREAD_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
+#elif defined(BENCHMARK_OS_WINDOWS)
+  HANDLE this_thread = GetCurrentThread();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
+                 &user_time);
+  return MakeTime(kernel_time, user_time);
+#elif defined(BENCHMARK_OS_MACOSX)
+  mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+  thread_basic_info_data_t info;
+  mach_port_t thread = pthread_mach_thread_np(pthread_self());
+  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
+      KERN_SUCCESS) {
+    return MakeTime(info);
+  }
+  DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#else
+#error Per-thread timing is not available on your system.
+#endif
+}
+
+namespace {
+
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  const std::size_t kStorageSize = 128;
+  char storage[kStorageSize];
+  std::size_t written;
+
+  if (local) {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written =
+        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::localtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  } else {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::gmtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  }
+  CHECK(written < kStorageSize);
+  ((void)written);  // prevent unused variable warning in optimized mode.
+  return std::string(storage);
+}
+
+}  // end namespace
+
+std::string LocalDateTimeString() { return DateTimeString(true); }
+
+}  // end namespace benchmark
diff --git a/src/timers.h b/src/timers.h
new file mode 100644
index 0000000..65606cc
--- /dev/null
+++ b/src/timers.h
@@ -0,0 +1,48 @@
+#ifndef BENCHMARK_TIMERS_H
+#define BENCHMARK_TIMERS_H
+
+#include <chrono>
+#include <string>
+
+namespace benchmark {
+
+// Return the CPU usage of the current process
+double ProcessCPUUsage();
+
+// Return the CPU usage of the children of the current process
+double ChildrenCPUUsage();
+
+// Return the CPU usage of the current thread
+double ThreadCPUUsage();
+
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+  typedef std::chrono::high_resolution_clock type;
+};
+
+template <>
+struct ChooseSteadyClock<false> {
+  typedef std::chrono::steady_clock type;
+};
+#endif
+
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  typedef ChooseSteadyClock<>::type type;
+#else
+  typedef std::chrono::high_resolution_clock type;
+#endif
+};
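+// When HAVE_STEADY_CLOCK is defined, ChooseClockType::type resolves to
+// std::chrono::high_resolution_clock if that clock is steady and to
+// std::chrono::steady_clock otherwise; without it, high_resolution_clock is
+// used unconditionally.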
+
+inline double ChronoClockNow() {
+  typedef ChooseClockType::type ClockType;
+  using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
+  return FpSeconds(ClockType::now().time_since_epoch()).count();
+}
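+// Illustrative use (hypothetical names):
+//   double start = ChronoClockNow();
+//   DoWork();
+//   double elapsed_seconds = ChronoClockNow() - start;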
+
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_TIMERS_H
diff --git a/src/walltime.cc b/src/walltime.cc
deleted file mode 100644
index 4bdbaa5..0000000
--- a/src/walltime.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "benchmark/macros.h"
-#include "internal_macros.h"
-#include "walltime.h"
-
-#if defined(BENCHMARK_OS_WINDOWS)
-#include <time.h>
-#include <winsock.h> // for timeval
-#else
-#include <sys/time.h>
-#endif
-
-#include <cstdio>
-#include <cstdint>
-#include <cstring>
-#include <ctime>
-
-#include <atomic>
-#include <chrono>
-#include <limits>
-
-#include "arraysize.h"
-#include "check.h"
-#include "cycleclock.h"
-#include "log.h"
-#include "sysinfo.h"
-
-namespace benchmark {
-namespace walltime {
-
-namespace {
-
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
-    typedef std::chrono::high_resolution_clock type;
-};
-
-template <>
-struct ChooseSteadyClock<false> {
-    typedef std::chrono::steady_clock type;
-};
-#endif
-
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
-  typedef ChooseSteadyClock<>::type type;
-#else
-  typedef std::chrono::high_resolution_clock type;
-#endif
-};
-
-class WallTimeImp
-{
-public:
-  WallTime Now();
-
-  static WallTimeImp& GetWallTimeImp() {
-    static WallTimeImp* imp = new WallTimeImp();
-    return *imp;
-  }
-
-private:
-  WallTimeImp();
-  // Helper routines to load/store a float from an AtomicWord. Required because
-  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
-  // get rid of this horror show.
-  void SetDrift(float f) {
-    int32_t w;
-    memcpy(&w, &f, sizeof(f));
-    std::atomic_store(&drift_adjust_, w);
-  }
-
-  float GetDrift() const {
-    float f;
-    int32_t w = std::atomic_load(&drift_adjust_);
-    memcpy(&f, &w, sizeof(f));
-    return f;
-  }
-
-  WallTime Slow() const {
-    struct timeval tv;
-#if defined(BENCHMARK_OS_WINDOWS)
-    FILETIME    file_time;
-    SYSTEMTIME  system_time;
-    ULARGE_INTEGER ularge;
-    const unsigned __int64 epoch = 116444736000000000LL;
-
-    GetSystemTime(&system_time);
-    SystemTimeToFileTime(&system_time, &file_time);
-    ularge.LowPart = file_time.dwLowDateTime;
-    ularge.HighPart = file_time.dwHighDateTime;
-
-    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
-    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
-#else
-    gettimeofday(&tv, nullptr);
-#endif
-    return tv.tv_sec + tv.tv_usec * 1e-6;
-  }
-
-private:
-  static_assert(sizeof(float) <= sizeof(int32_t),
-               "type sizes don't allow the drift_adjust hack");
-
-  WallTime base_walltime_;
-  int64_t base_cycletime_;
-  int64_t cycles_per_second_;
-  double seconds_per_cycle_;
-  uint32_t last_adjust_time_;
-  std::atomic<int32_t> drift_adjust_;
-  int64_t max_interval_cycles_;
-
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
-};
-
-
-WallTime WallTimeImp::Now() {
-  WallTime now = 0.0;
-  WallTime result = 0.0;
-  int64_t ct = 0;
-  uint32_t top_bits = 0;
-  do {
-    ct = cycleclock::Now();
-    int64_t cycle_delta = ct - base_cycletime_;
-    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
-
-    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
-    // Recompute drift no more often than every 2^32 cycles.
-    // I.e., @2GHz, ~ every two seconds
-    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
-      return result + GetDrift();
-    }
-
-    now = Slow();
-  } while (cycleclock::Now() - ct > max_interval_cycles_);
-  // We are now sure that "now" and "result" were produced within
-  // kMaxErrorInterval of one another.
-
-  SetDrift(static_cast<float>(now - result));
-  last_adjust_time_ = top_bits;
-  return now;
-}
-
-
-WallTimeImp::WallTimeImp()
-    : base_walltime_(0.0), base_cycletime_(0),
-      cycles_per_second_(0), seconds_per_cycle_(0.0),
-      last_adjust_time_(0), drift_adjust_(0),
-      max_interval_cycles_(0) {
-  const double kMaxErrorInterval = 100e-6;
-  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
-  CHECK(cycles_per_second_ != 0);
-  seconds_per_cycle_ = 1.0 / cycles_per_second_;
-  max_interval_cycles_ =
-      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
-  do {
-    base_cycletime_ = cycleclock::Now();
-    base_walltime_ = Slow();
-  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
-  // We are now sure that "base_walltime" and "base_cycletime" were produced
-  // within kMaxErrorInterval of one another.
-
-  SetDrift(0.0);
-  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
-}
-
-WallTime CPUWalltimeNow() {
-  static WallTimeImp& imp = WallTimeImp::GetWallTimeImp();
-  return imp.Now();
-}
-
-WallTime ChronoWalltimeNow() {
-  typedef ChooseClockType::type Clock;
-  typedef std::chrono::duration<WallTime, std::chrono::seconds::period>
-          FPSeconds;
-  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
-                "This type must be treated as a floating point type.");
-  auto now = Clock::now().time_since_epoch();
-  return std::chrono::duration_cast<FPSeconds>(now).count();
-}
-
-bool UseCpuCycleClock() {
-    bool useWallTime = !CpuScalingEnabled();
-    if (useWallTime) {
-        VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
-    } else {
-        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
-    }
-    return useWallTime;
-}
-
-
-} // end anonymous namespace
-
-// WallTimeImp doesn't work when CPU Scaling is enabled. If CPU Scaling is
-// enabled at the start of the program then std::chrono::system_clock is used
-// instead.
-WallTime Now()
-{
-  static bool useCPUClock = UseCpuCycleClock();
-  if (useCPUClock) {
-    return CPUWalltimeNow();
-  } else {
-    return ChronoWalltimeNow();
-  }
-}
-
-}  // end namespace walltime
-
-
-namespace {
-
-std::string DateTimeString(bool local) {
-  typedef std::chrono::system_clock Clock;
-  std::time_t now = Clock::to_time_t(Clock::now());
-  char storage[128];
-  std::size_t written;
-
-  if (local) {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  } else {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  }
-  CHECK(written < arraysize(storage));
-  ((void)written); // prevent unused variable in optimized mode.
-  return std::string(storage);
-}
-
-} // end namespace
-
-std::string LocalDateTimeString() {
-  return DateTimeString(true);
-}
-
-}  // end namespace benchmark
diff --git a/src/walltime.h b/src/walltime.h
deleted file mode 100644
index 38c26f3..0000000
--- a/src/walltime.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef BENCHMARK_WALLTIME_H_
-#define BENCHMARK_WALLTIME_H_
-
-#include <string>
-
-namespace benchmark {
-typedef double WallTime;
-
-namespace walltime {
-WallTime Now();
-}  // end namespace walltime
-
-std::string LocalDateTimeString();
-
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_WALLTIME_H_
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index aeb720a..8724598 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,11 +2,27 @@
 
 find_package(Threads REQUIRED)
 
+# NOTE: These flags must be added after find_package(Threads REQUIRED);
+# otherwise they will break the configuration check.
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
+  list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+endif()
+
+add_library(output_test_helper STATIC output_test_helper.cc)
+
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
   target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_benchmark_test)
 
+
+macro(compile_output_test name)
+  add_executable(${name} "${name}.cc" output_test.h)
+  target_link_libraries(${name} output_test_helper benchmark
+          ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+endmacro(compile_output_test)
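+# compile_output_test links the test against output_test_helper, the shared
+# output-checking harness used by reporter_output_test and complexity_test.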
+
+
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
 add_test(benchmark benchmark_test --benchmark_min_time=0.01)
@@ -45,10 +61,16 @@
 compile_benchmark_test(fixture_test)
 add_test(fixture_test fixture_test --benchmark_min_time=0.01)
 
+compile_benchmark_test(register_benchmark_test)
+add_test(register_benchmark_test register_benchmark_test --benchmark_min_time=0.01)
+
 compile_benchmark_test(map_test)
 add_test(map_test map_test --benchmark_min_time=0.01)
 
-compile_benchmark_test(reporter_output_test)
+compile_benchmark_test(multiple_ranges_test)
+add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01)
+
+compile_output_test(reporter_output_test)
 add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
 
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
@@ -63,8 +85,14 @@
   add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
 endif()
 
-compile_benchmark_test(complexity_test)
-add_test(complexity_benchmark complexity_test --benchmark_min_time=0.01)
+# Attempt to work around flaky test failures when running on Appveyor servers.
+if (DEFINED ENV{APPVEYOR})
+  set(COMPLEXITY_MIN_TIME "0.5")
+else()
+  set(COMPLEXITY_MIN_TIME "0.01")
+endif()
+compile_output_test(complexity_test)
+add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
 
 # Add the coverage command(s)
 if(CMAKE_BUILD_TYPE)
diff --git a/test/basic_test.cc b/test/basic_test.cc
index 3435415..22de007 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -1,8 +1,7 @@
 
 #include "benchmark/benchmark_api.h"
 
-#define BASIC_BENCHMARK_TEST(x) \
-    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
 
 void BM_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
@@ -14,7 +13,7 @@
 
 void BM_spin_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
-    for (int x = 0; x < state.range_x(); ++x) {
+    for (int x = 0; x < state.range(0); ++x) {
       benchmark::DoNotOptimize(x);
     }
   }
@@ -23,11 +22,11 @@
 BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
 
 void BM_spin_pause_before(benchmark::State& state) {
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  while (state.KeepRunning()) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -35,15 +34,14 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_before);
 BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
 
-
 void BM_spin_pause_during(benchmark::State& state) {
-  while(state.KeepRunning()) {
+  while (state.KeepRunning()) {
     state.PauseTiming();
-    for (int i = 0; i < state.range_x(); ++i) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
     state.ResumeTiming();
-    for (int i = 0; i < state.range_x(); ++i) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -52,7 +50,7 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
 
 void BM_pause_during(benchmark::State& state) {
-  while(state.KeepRunning()) {
+  while (state.KeepRunning()) {
     state.PauseTiming();
     state.ResumeTiming();
   }
@@ -63,38 +61,37 @@
 BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
 
 void BM_spin_pause_after(benchmark::State& state) {
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  while (state.KeepRunning()) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
 BASIC_BENCHMARK_TEST(BM_spin_pause_after);
 BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
 
-
 void BM_spin_pause_before_and_after(benchmark::State& state) {
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  while (state.KeepRunning()) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
 BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
 BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
 
-
 void BM_empty_stop_start(benchmark::State& state) {
-  while (state.KeepRunning()) { }
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_empty_stop_start);
 BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 66f5956..d832f81 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -4,6 +4,7 @@
 #include <math.h>
 #include <stdint.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
@@ -13,15 +14,14 @@
 #include <set>
 #include <sstream>
 #include <string>
-#include <vector>
-#include <chrono>
 #include <thread>
 #include <utility>
+#include <vector>
 
 #if defined(__GNUC__)
-# define BENCHMARK_NOINLINE __attribute__((noinline))
+#define BENCHMARK_NOINLINE __attribute__((noinline))
 #else
-# define BENCHMARK_NOINLINE
+#define BENCHMARK_NOINLINE
 #endif
 
 namespace {
@@ -42,8 +42,7 @@
 
 std::set<int> ConstructRandomSet(int size) {
   std::set<int> s;
-  for (int i = 0; i < size; ++i)
-    s.insert(i);
+  for (int i = 0; i < size; ++i) s.insert(i);
   return s;
 }
 
@@ -54,8 +53,7 @@
 
 static void BM_Factorial(benchmark::State& state) {
   int fac_42 = 0;
-  while (state.KeepRunning())
-    fac_42 = Factorial(8);
+  while (state.KeepRunning()) fac_42 = Factorial(8);
   // Prevent compiler optimizations
   std::stringstream ss;
   ss << fac_42;
@@ -66,8 +64,7 @@
 
 static void BM_CalculatePiRange(benchmark::State& state) {
   double pi = 0.0;
-  while (state.KeepRunning())
-    pi = CalculatePi(state.range_x());
+  while (state.KeepRunning()) pi = CalculatePi(state.range(0));
   std::stringstream ss;
   ss << pi;
   state.SetLabel(ss.str());
@@ -87,29 +84,29 @@
 static void BM_SetInsert(benchmark::State& state) {
   while (state.KeepRunning()) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range_x());
+    std::set<int> data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
-      data.insert(rand());
+    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
   }
-  state.SetItemsProcessed(state.iterations() * state.range_y());
-  state.SetBytesProcessed(state.iterations() * state.range_y() * sizeof(int));
+  state.SetItemsProcessed(state.iterations() * state.range(1));
+  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
 }
-BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10);
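+// Ranges() supersedes the old RangePair(): each {lo, hi} pair expands into
+// the argument range for one dimension, consumed via state.range(0) and
+// state.range(1) above.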
+BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {1, 10}});
 
-template<typename Container, typename ValueType = typename Container::value_type>
+template <typename Container,
+          typename ValueType = typename Container::value_type>
 static void BM_Sequential(benchmark::State& state) {
   ValueType v = 42;
   while (state.KeepRunning()) {
     Container c;
-    for (int i = state.range_x(); --i; )
-      c.push_back(v);
+    for (int i = state.range(0); --i;) c.push_back(v);
   }
-  const size_t items_processed = state.iterations() * state.range_x();
+  const size_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
   state.SetBytesProcessed(items_processed * sizeof(v));
 }
-BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
+    ->Range(1 << 0, 1 << 10);
 BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
 // Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
 #if __cplusplus >= 201103L
@@ -117,12 +114,11 @@
 #endif
 
 static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range_x(), '-');
-  std::string s2(state.range_x(), '-');
-  while (state.KeepRunning())
-    benchmark::DoNotOptimize(s1.compare(s2));
+  std::string s1(state.range(0), '-');
+  std::string s2(state.range(0), '-');
+  while (state.KeepRunning()) benchmark::DoNotOptimize(s1.compare(s2));
 }
-BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
+BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
 static void BM_SetupTeardown(benchmark::State& state) {
   if (state.thread_index == 0) {
@@ -132,7 +128,7 @@
   int i = 0;
   while (state.KeepRunning()) {
     std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i%2 == 0)
+    if (i % 2 == 0)
       test_vector->push_back(i);
     else
       test_vector->pop_back();
@@ -147,14 +143,14 @@
 static void BM_LongTest(benchmark::State& state) {
   double tracker = 0.0;
   while (state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i)
+    for (int i = 0; i < state.range(0); ++i)
       benchmark::DoNotOptimize(tracker += i);
   }
 }
-BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
+BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
 static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range_x() / sizeof(int);
+  int size = state.range(0) / sizeof(int);
   int thread_size = size / state.threads;
   int from = thread_size * state.thread_index;
   int to = from + thread_size;
@@ -179,21 +175,19 @@
 
 static void BM_ManualTiming(benchmark::State& state) {
   size_t slept_for = 0;
-  int microseconds = state.range_x();
-  std::chrono::duration<double, std::micro> sleep_duration {
-    static_cast<double>(microseconds)
-  };
+  int microseconds = state.range(0);
+  std::chrono::duration<double, std::micro> sleep_duration{
+      static_cast<double>(microseconds)};
 
   while (state.KeepRunning()) {
-    auto start   = std::chrono::high_resolution_clock::now();
+    auto start = std::chrono::high_resolution_clock::now();
     // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(std::chrono::duration_cast<
-      std::chrono::nanoseconds>(sleep_duration));
-    auto end     = std::chrono::high_resolution_clock::now();
+    std::this_thread::sleep_for(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
+    auto end = std::chrono::high_resolution_clock::now();
 
     auto elapsed =
-      std::chrono::duration_cast<std::chrono::duration<double>>(
-        end - start);
+        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
     state.SetIterationTime(elapsed.count());
     slept_for += microseconds;
@@ -205,20 +199,43 @@
 
 #if __cplusplus >= 201103L
 
-template <class ...Args>
+template <class... Args>
 void BM_with_args(benchmark::State& state, Args&&...) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44);
-BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test,
-                  std::string("abc"), std::pair<int, double>(42, 3.8));
+BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"),
+                  std::pair<int, double>(42, 3.8));
 
 void BM_non_template_args(benchmark::State& state, int, double) {
-  while(state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
-#endif // __cplusplus >= 201103L
+#endif  // __cplusplus >= 201103L
+
+static void BM_DenseThreadRanges(benchmark::State& st) {
+  switch (st.range(0)) {
+    case 1:
+      assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
+      break;
+    case 2:
+      assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
+      break;
+    case 3:
+      assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
+             st.threads == 14);
+      break;
+    default:
+      assert(false && "Invalid test case number");
+  }
+  while (st.KeepRunning()) {
+  }
+}
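+// DenseThreadRange(min, max, stride) runs the benchmark at thread counts
+// min, min+stride, ..., with max always included; the assertions above list
+// the counts expected for each test case.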
+BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3);
+BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
+BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
 BENCHMARK_MAIN()
-
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 8ab88f9..14e03b0 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -1,145 +1,41 @@
-
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
-#include "../src/re.h"    // NOTE: re.h is for internal use only
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <utility>
 #include <algorithm>
+#include <cassert>
 #include <cmath>
+#include <cstdlib>
+#include <vector>
+#include "benchmark/benchmark.h"
+#include "output_test.h"
 
 namespace {
 
-// ========================================================================= //
-// -------------------------- Testing Case --------------------------------- //
-// ========================================================================= //
-
-enum MatchRules {
-  MR_Default, // Skip non-matching lines until a match is found.
-  MR_Next    // Match must occur on the next line.
-};
-
-struct TestCase {
-  std::string regex;
-  int match_rule;
-
-  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
-
-  void Check(std::stringstream& remaining_output) const {
-    benchmark::Regex r;
-    std::string err_str;
-    r.Init(regex, &err_str);
-    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
-                           << " got Error: " << err_str;
-
-    std::string line;
-    while (remaining_output.eof() == false) {
-        CHECK(remaining_output.good());
-        std::getline(remaining_output, line);
-        if (r.Match(line)) return;
-        CHECK(match_rule != MR_Next) << "Expected line \"" << line
-                                     << "\" to match regex \"" << regex << "\"";
-    }
-
-    CHECK(remaining_output.eof() == false)
-        << "End of output reached before match for regex \"" << regex
-        << "\" was found";
-  }
-};
-
-std::vector<TestCase> ConsoleOutputTests;
-std::vector<TestCase> JSONOutputTests;
-std::vector<TestCase> CSVOutputTests;
-
-// ========================================================================= //
-// -------------------------- Test Helpers --------------------------------- //
-// ========================================================================= //
-
-class TestReporter : public benchmark::BenchmarkReporter {
-public:
-  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps)  {}
-
-  virtual bool ReportContext(const Context& context) {
-    bool last_ret = false;
-    bool first = true;
-    for (auto rep : reporters_) {
-      bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
-          << "Reports return different values for ReportContext";
-      first = false;
-      last_ret = new_ret;
-    }
-    return last_ret;
-  }
-
-  virtual void ReportRuns(const std::vector<Run>& report) {
-    for (auto rep : reporters_)
-      rep->ReportRuns(report);
-  }
-
-  virtual void Finalize() {
-      for (auto rep : reporters_)
-        rep->Finalize();
-  }
-
-private:
-  std::vector<benchmark::BenchmarkReporter*> reporters_;
-};
-
-
-#define CONCAT2(x, y) x##y
-#define CONCAT(x, y) CONCAT2(x, y)
-
-#define ADD_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
-
-int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
-  for (auto const& TC : v)
-    out->push_back(TC);
-  return 0;
-}
-
-template <class First>
-std::string join(First f) { return f; }
-
-template <class First, class ...Args>
-std::string join(First f, Args&&... args) {
-    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
-}
-
-std::string dec_re = "[0-9]+\\.[0-9]+";
-
 #define ADD_COMPLEXITY_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+  int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::vector<TestCase>* console_out, std::vector<TestCase>* json_out,
-                      std::vector<TestCase>* csv_out, std::string big_o_test_name, 
-                      std::string rms_test_name, std::string big_o) {
-  std::string big_o_str = dec_re + " " + big_o;
-  AddCases(console_out, {
-    {join("^" + big_o_test_name + "", big_o_str, big_o_str) + "[ ]*$"},
-    {join("^" + rms_test_name + "", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
-  });
-  AddCases(json_out, {
-    {"\"name\": \"" + big_o_test_name + "\",$"},
-    {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
-    {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
-    {"\"big_o\": \"" + big_o + "\",$", MR_Next},
-    {"\"time_unit\": \"ns\"$", MR_Next},
-    {"}", MR_Next},
-    {"\"name\": \"" + rms_test_name + "\",$"},
-    {"\"rms\": [0-9]+%$", MR_Next},
-    {"}", MR_Next}
-  });
-  AddCases(csv_out, {
-    {"^\"" + big_o_test_name + "\",," + dec_re + "," + dec_re + "," + big_o + ",,,,,$"},
-    {"^\"" + rms_test_name + "\",," + dec_re + "," + dec_re + ",,,,,,$"}
-  });
+int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name,
+                      std::string big_o) {
+  SetSubstitutions({{"%bigo_name", big_o_test_name},
+                    {"%rms_name", rms_test_name},
+                    {"%bigo_str", "[ ]* %float " + big_o},
+                    {"%bigo", big_o},
+                    {"%rms", "[ ]*[0-9]+ %"}});
+  AddCases(
+      TC_ConsoleOut,
+      {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
+       {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
+       {"^%rms_name %rms %rms[ ]*$", MR_Next}});
+  AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
+                        {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
+                        {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
+                        {"\"big_o\": \"%bigo\",$", MR_Next},
+                        {"\"time_unit\": \"ns\"$", MR_Next},
+                        {"}", MR_Next},
+                        {"\"name\": \"%rms_name\",$"},
+                        {"\"rms\": %float$", MR_Next},
+                        {"}", MR_Next}});
+  AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
+                       {"^\"%bigo_name\"", MR_Not},
+                       {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
   return 0;
 }
 
@@ -151,25 +47,35 @@
 
 void BM_Complexity_O1(benchmark::State& state) {
   while (state.KeepRunning()) {
+    for (int i = 0; i < 1024; ++i) {
+      benchmark::DoNotOptimize(&i);
+    }
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity(benchmark::o1);
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity([](int){return 1.0; });
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity();
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int) {
+  return 1.0;
+});
 
-const char* big_o_1_test_name = "BM_Complexity_O1_BigO";
-const char* rms_o_1_test_name = "BM_Complexity_O1_RMS";
-const char* enum_auto_big_o_1 = "\\([0-9]+\\)";
-const char* lambda_big_o_1 = "f\\(N\\)";
+const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
+const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
+const char *enum_big_o_1 = "\\([0-9]+\\)";
+// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is
+// auto-deduced.
+// See https://github.com/google/benchmark/issues/272
+const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
+const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_1_test_name, rms_o_1_test_name, enum_auto_big_o_1);
+ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1);
+
+// Add auto enum tests
+ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
@@ -179,119 +85,83 @@
   std::vector<int> v;
   v.reserve(size);
   for (int i = 0; i < size; ++i) {
-    v.push_back(rand() % size);
+    v.push_back(std::rand() % size);
   }
   return v;
 }
 
 void BM_Complexity_O_N(benchmark::State& state) {
-  auto v = ConstructRandomVector(state.range_x());
-  const int item_not_in_vector = state.range_x()*2; // Test worst case scenario (item not in vector)
+  auto v = ConstructRandomVector(state.range(0));
+  const int item_not_in_vector =
+      state.range(0) * 2;  // Test worst case scenario (item not in vector)
   while (state.KeepRunning()) {
-      benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+    benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oN);
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) -> double{return n; });
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity(benchmark::oN);
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity([](int n) -> double { return n; });
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity();
 
-const char* big_o_n_test_name = "BM_Complexity_O_N_BigO";
-const char* rms_o_n_test_name = "BM_Complexity_O_N_RMS";
-const char* enum_auto_big_o_n = "N";
-const char* lambda_big_o_n = "f\\(N\\)";
+const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
+const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
+const char *enum_auto_big_o_n = "N";
+const char *lambda_big_o_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
 // ========================================================================= //
 
 static void BM_Complexity_O_N_log_N(benchmark::State& state) {
-  auto v = ConstructRandomVector(state.range_x());
+  auto v = ConstructRandomVector(state.range(0));
   while (state.KeepRunning()) {
-      std::sort(v.begin(), v.end());
+    std::sort(v.begin(), v.end());
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oNLogN);
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) {return n * std::log2(n); });
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity(benchmark::oNLogN);
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity([](int n) { return n * std::log2(n); });
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity();
 
-const char* big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
-const char* rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
-const char* enum_auto_big_o_n_lg_n = "NlgN";
-const char* lambda_big_o_n_lg_n = "f\\(N\\)";
+const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
+const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
+const char *enum_auto_big_o_n_lg_n = "NlgN";
+const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
+                     enum_auto_big_o_n_lg_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
-
+ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
+                     lambda_big_o_n_lg_n);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-
-int main(int argc, char* argv[]) {
-  // Add --color_print=false to argv since we don't want to match color codes.
-  char new_arg[64];
-  char* new_argv[64];
-  std::copy(argv, argv + argc, new_argv);
-  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
-  benchmark::Initialize(&argc, new_argv);
-
-  benchmark::ConsoleReporter CR;
-  benchmark::JSONReporter JR;
-  benchmark::CSVReporter CSVR;
-  struct ReporterTest {
-    const char* name;
-    std::vector<TestCase>& output_cases;
-    benchmark::BenchmarkReporter& reporter;
-    std::stringstream out_stream;
-    std::stringstream err_stream;
-
-    ReporterTest(const char* n,
-                 std::vector<TestCase>& out_tc,
-                 benchmark::BenchmarkReporter& br)
-        : name(n), output_cases(out_tc), reporter(br) {
-        reporter.SetOutputStream(&out_stream);
-        reporter.SetErrorStream(&err_stream);
-    }
-  } TestCases[] = {
-      {"ConsoleReporter", ConsoleOutputTests, CR},
-      {"JSONReporter", JSONOutputTests, JR},
-      {"CSVReporter", CSVOutputTests, CSVR}
-  };
-
-  // Create the test reporter and run the benchmarks.
-  std::cout << "Running benchmarks...\n";
-  TestReporter test_rep({&CR, &JR, &CSVR});
-  benchmark::RunSpecifiedBenchmarks(&test_rep);
-
-  for (auto& rep_test : TestCases) {
-      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
-      std::string banner(msg.size() - 1, '-');
-      std::cout << banner << msg << banner << "\n";
-
-      std::cerr << rep_test.err_stream.str();
-      std::cout << rep_test.out_stream.str();
-
-      for (const auto& TC : rep_test.output_cases)
-        TC.Check(rep_test.out_stream);
-
-      std::cout << "\n";
-  }
-  return 0;
-}
-
+int main(int argc, char *argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index 56779d6..4f3d0fb 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -1,4 +1,5 @@
-
+#undef NDEBUG
+#include <cassert>
 #include <cstddef>
 
 #include "benchmark/benchmark.h"
@@ -8,22 +9,32 @@
 #endif
 
 void BM_empty(benchmark::State& state) {
-    while (state.KeepRunning()) {
-        volatile std::size_t x = state.iterations();
-        ((void)x);
-    }
+  while (state.KeepRunning()) {
+    volatile std::size_t x = state.iterations();
+    ((void)x);
+  }
 }
 BENCHMARK(BM_empty);
 
+// The new C++11 interface for args/ranges requires initializer list support.
+// Therefore we provide the old interface to support C++03.
+void BM_old_arg_range_interface(benchmark::State& state) {
+  assert((state.range(0) == 1 && state.range(1) == 2) ||
+         (state.range(0) == 5 && state.range(1) == 6));
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6);
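+// ArgPair(1, 2) yields the argument pair (1, 2); RangePair(5, 5, 6, 6)
+// expands the two degenerate ranges [5,5] and [6,6] to the single pair
+// (5, 6) -- exactly the two combinations the assert above accepts.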
+
 template <class T, class U>
 void BM_template2(benchmark::State& state) {
-    BM_empty(state);
+  BM_empty(state);
 }
 BENCHMARK_TEMPLATE2(BM_template2, int, long);
 
 template <class T>
 void BM_template1(benchmark::State& state) {
-    BM_empty(state);
+  BM_empty(state);
 }
 BENCHMARK_TEMPLATE(BM_template1, long);
 BENCHMARK_TEMPLATE1(BM_template1, int);
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index 60fa3b1..c6c235d 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -7,10 +7,11 @@
 // NOTE: Users should NOT include or use src/check.h. This is only done in
 // order to test library internals.
 
-#include "benchmark/benchmark_api.h"
-#include "../src/check.h"
-#include <stdexcept>
 #include <cstdlib>
+#include <stdexcept>
+
+#include "../src/check.h"
+#include "benchmark/benchmark_api.h"
 
 #if defined(__GNUC__) && !defined(__EXCEPTIONS)
 #define TEST_HAS_NO_EXCEPTIONS
@@ -29,13 +30,15 @@
   try {
     state.PauseTiming();
     std::abort();
-  } catch (std::logic_error const&) {}
+  } catch (std::logic_error const&) {
+  }
   try {
     state.ResumeTiming();
     std::abort();
-  } catch (std::logic_error const&) {}
+  } catch (std::logic_error const&) {
+  }
 #else
-  (void)state; // avoid unused warning
+  (void)state;  // avoid unused warning
 #endif
 }
 
@@ -54,7 +57,7 @@
 }
 BENCHMARK(BM_diagnostic_test);
 
-int main(int argc, char** argv) {
+int main(int argc, char* argv[]) {
   benchmark::internal::GetAbortHandler() = &TestHandler;
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
index e4453fb..b21187a 100644
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@@ -4,15 +4,12 @@
 
 namespace {
 #if defined(__GNUC__)
-  std::uint64_t double_up(const std::uint64_t x) __attribute__ ((const));
+std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
 #endif
-  std::uint64_t double_up(const std::uint64_t x) {
-    return x * 2;
-  }
+std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
 }
 
 int main(int, char*[]) {
-
   // this test verifies compilation of DoNotOptimize() for some types
 
   char buffer8[8];
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 0ba4071..3a20529 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -27,9 +27,7 @@
 
   virtual ~TestReporter() {}
 
-  size_t GetCount() const {
-    return count_;
-  }
+  size_t GetCount() const { return count_; }
 
  private:
   mutable size_t count_;
@@ -37,46 +35,47 @@
 
 }  // end namespace
 
-
 static void NoPrefix(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(NoPrefix);
 
 static void BM_Foo(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_Foo);
 
-
 static void BM_Bar(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_Bar);
 
-
 static void BM_FooBar(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_FooBar);
 
-
 static void BM_FooBa(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_FooBa);
 
-
-
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   bool list_only = false;
-  for (int i=0; i < argc; ++i)
-    list_only |= std::string(argv[i]).find("--benchmark_list_tests") != std::string::npos;
+  for (int i = 0; i < argc; ++i)
+    list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
+                 std::string::npos;
 
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
-  const size_t returned_count = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter);
 
   if (argc == 2) {
     // Make sure we ran all of the tests
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index bf800fd..bbc2f95 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -20,15 +20,12 @@
     }
   }
 
-  ~MyFixture() {
-    assert(data == nullptr);
-  }
+  ~MyFixture() { assert(data == nullptr); }
 
   std::unique_ptr<int> data;
 };
 
-
-BENCHMARK_F(MyFixture, Foo)(benchmark::State& st) {
+BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
   while (st.KeepRunning()) {
@@ -44,7 +41,7 @@
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
-  st.SetItemsProcessed(st.range_x());
+  st.SetItemsProcessed(st.range(0));
 }
 BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
 BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
diff --git a/test/map_test.cc b/test/map_test.cc
index 5eccf8d..83457c9 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -17,7 +17,7 @@
 
 // Basic version.
 static void BM_MapLookup(benchmark::State& state) {
-  const int size = state.range_x();
+  const int size = state.range(0);
   while (state.KeepRunning()) {
     state.PauseTiming();
     std::map<int, int> m = ConstructRandomMap(size);
@@ -34,18 +34,16 @@
 class MapFixture : public ::benchmark::Fixture {
  public:
   void SetUp(const ::benchmark::State& st) {
-    m = ConstructRandomMap(st.range_x());
+    m = ConstructRandomMap(st.range(0));
   }
 
-  void TearDown(const ::benchmark::State&) {
-    m.clear();
-  }
+  void TearDown(const ::benchmark::State&) { m.clear(); }
 
   std::map<int, int> m;
 };
 
 BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
-  const int size = state.range_x();
+  const int size = state.range(0);
   while (state.KeepRunning()) {
     for (int i = 0; i < size; ++i) {
       benchmark::DoNotOptimize(m.find(rand() % size));
@@ -53,6 +51,6 @@
   }
   state.SetItemsProcessed(state.iterations() * size);
 }
-BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1<<3, 1<<12);
+BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12);
 
 BENCHMARK_MAIN()
diff --git a/test/multiple_ranges_test.cc b/test/multiple_ranges_test.cc
new file mode 100644
index 0000000..8e67b3b
--- /dev/null
+++ b/test/multiple_ranges_test.cc
@@ -0,0 +1,74 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <set>
+
+class MultipleRangesFixture : public ::benchmark::Fixture {
+ public:
+  MultipleRangesFixture()
+      : expectedValues({{1, 3, 5},
+                        {1, 3, 8},
+                        {1, 3, 15},
+                        {2, 3, 5},
+                        {2, 3, 8},
+                        {2, 3, 15},
+                        {1, 4, 5},
+                        {1, 4, 8},
+                        {1, 4, 15},
+                        {2, 4, 5},
+                        {2, 4, 8},
+                        {2, 4, 15},
+                        {1, 7, 5},
+                        {1, 7, 8},
+                        {1, 7, 15},
+                        {2, 7, 5},
+                        {2, 7, 8},
+                        {2, 7, 15},
+                        {7, 6, 3}}) {}
+
+  void SetUp(const ::benchmark::State& state) {
+    std::vector<int> ranges = {state.range(0), state.range(1), state.range(2)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  virtual ~MultipleRangesFixture() {
+    assert(actualValues.size() == expectedValues.size());
+  }
+
+  std::set<std::vector<int>> expectedValues;
+  std::set<std::vector<int>> actualValues;
+};
+
+BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    int product = state.range(0) * state.range(1) * state.range(2);
+    for (int x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(MultipleRangesFixture, Empty)
+    ->RangeMultiplier(2)
+    ->Ranges({{1, 2}, {3, 7}, {5, 15}})
+    ->Args({7, 6, 3});
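+// With RangeMultiplier(2), Ranges({{1, 2}, {3, 7}, {5, 15}}) generates the
+// cross product {1, 2} x {3, 4, 7} x {5, 8, 15}, and Args({7, 6, 3}) appends
+// one explicit triple; together these are exactly the fixture's
+// expectedValues.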
+
+void BM_CheckDefaultArgument(benchmark::State& state) {
+  // Test that the 'range()' without an argument is the same as 'range(0)'.
+  assert(state.range() == state.range(0));
+  assert(state.range() != state.range(1));
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}});
+
+static void BM_MultipleRanges(benchmark::State& st) {
+  while (st.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}});
+
+BENCHMARK_MAIN()
diff --git a/test/options_test.cc b/test/options_test.cc
index 78cedae..bedb1cc 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -9,11 +9,10 @@
 }
 
 void BM_basic_slow(benchmark::State& state) {
-  std::chrono::milliseconds sleep_duration(state.range_x());
+  std::chrono::milliseconds sleep_duration(state.range(0));
   while (state.KeepRunning()) {
     std::this_thread::sleep_for(
-      std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration)
-      );
+        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
   }
 }
 
@@ -25,8 +24,8 @@
 BENCHMARK(BM_basic)->Range(1, 8);
 BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
-BENCHMARK(BM_basic)->ArgPair(42, 42);
-BENCHMARK(BM_basic)->RangePair(64, 512, 64, 512);
+BENCHMARK(BM_basic)->Args({42, 42});
+BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}});
 BENCHMARK(BM_basic)->MinTime(0.7);
 BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
diff --git a/test/output_test.h b/test/output_test.h
new file mode 100644
index 0000000..57d4397
--- /dev/null
+++ b/test/output_test.h
@@ -0,0 +1,71 @@
+#ifndef TEST_OUTPUT_TEST_H
+#define TEST_OUTPUT_TEST_H
+
+#undef NDEBUG
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "../src/re.h"
+#include "benchmark/benchmark.h"
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__)
+
+#define SET_SUBSTITUTIONS(...) \
+  int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__)
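+
+// Both macros expand to a dummy global int whose initializer runs before
+// main(), so test files can register cases and substitutions at file scope,
+// e.g. ADD_CASES(TC_ConsoleOut, {{"^Benchmark"}});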
+
+enum MatchRules {
+  MR_Default,  // Skip non-matching lines until a match is found.
+  MR_Next,     // Match must occur on the next line.
+  MR_Not  // No line between the current position and the next match matches
+          // the regex
+};
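+
+// Example (a sketch): given {{"^start$"}, {"^skip$", MR_Not}, {"^end$"}}, the
+// checker seeks a line matching ^start$, then seeks one matching ^end$ while
+// requiring that no line consumed in between matches ^skip$.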
+
+struct TestCase {
+  TestCase(std::string re, int rule = MR_Default);
+
+  std::string regex_str;
+  int match_rule;
+  std::string substituted_regex;
+  std::shared_ptr<benchmark::Regex> regex;
+};
+
+enum TestCaseID {
+  TC_ConsoleOut,
+  TC_ConsoleErr,
+  TC_JSONOut,
+  TC_JSONErr,
+  TC_CSVOut,
+  TC_CSVErr,
+
+  TC_NumID  // PRIVATE
+};
+
+// Add a list of test cases to be run against the output specified by
+// 'ID'.
+int AddCases(TestCaseID ID, std::initializer_list<TestCase> il);
+
+// Add or set a list of substitutions to be performed on constructed regexes.
+// See 'output_test_helper.cc' for a list of default substitutions.
+int SetSubstitutions(
+    std::initializer_list<std::pair<std::string, std::string>> il);
+
+// Run all output tests.
+void RunOutputTests(int argc, char* argv[]);
+
+// ========================================================================= //
+// --------------------------- Misc Utilities ------------------------------ //
+// ========================================================================= //
+
+namespace {
+
+const char* const dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
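+// dec_re matches benchmark's floating-point output, e.g. "123", "0.456", or
+// "1.5e+09".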
+
+}  //  end namespace
+
+#endif  // TEST_OUTPUT_TEST_H
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
new file mode 100644
index 0000000..721d39f
--- /dev/null
+++ b/test/output_test_helper.cc
@@ -0,0 +1,234 @@
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "../src/re.h"     // NOTE: re.h is for internal use only
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------------ Internals -------------------------------- //
+// ========================================================================= //
+namespace internal {
+namespace {
+
+using TestCaseList = std::vector<TestCase>;
+
+// Use a vector because the order in which elements are added matters during
+// iteration. std::map/unordered_map don't guarantee that.
+// For example:
+//  SetSubstitutions({{"%HelloWorld", "Hello"}, {"%Hello", "Hi"}});
+//     Substitute("%HelloWorld") // Always expands to Hello.
+using SubMap = std::vector<std::pair<std::string, std::string>>;
+
+TestCaseList& GetTestCaseList(TestCaseID ID) {
+  // Uses function-local statics to ensure initialization occurs
+  // before first use.
+  static TestCaseList lists[TC_NumID];
+  return lists[ID];
+}
+
+SubMap& GetSubstitutions() {
+  // Don't use 'dec_re' from header because it may not yet be initialized.
+  static std::string dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+  static SubMap map = {
+      {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
+      {"%int", "[ ]*[0-9]+"},
+      {" %s ", "[ ]+"},
+      {"%time", "[ ]*[0-9]{1,5} ns"},
+      {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"},
+      {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
+      {"%csv_report", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,"},
+      {"%csv_us_report", "[0-9]+," + dec_re + "," + dec_re + ",us,,,,,"},
+      {"%csv_bytes_report",
+       "[0-9]+," + dec_re + "," + dec_re + ",ns," + dec_re + ",,,,"},
+      {"%csv_items_report",
+       "[0-9]+," + dec_re + "," + dec_re + ",ns,," + dec_re + ",,,"},
+      {"%csv_label_report_begin", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,"},
+      {"%csv_label_report_end", ",,"}};
+  return map;
+}
+
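+// Replace every occurrence of each key in 'source' with its value, scanning
+// left to right. Resuming the search past the inserted text keeps a value
+// that contains its own key from being substituted again.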
+std::string PerformSubstitutions(std::string source) {
+  SubMap const& subs = GetSubstitutions();
+  using SizeT = std::string::size_type;
+  for (auto const& KV : subs) {
+    SizeT pos;
+    SizeT next_start = 0;
+    while ((pos = source.find(KV.first, next_start)) != std::string::npos) {
+      next_start = pos + KV.second.size();
+      source.replace(pos, KV.first.size(), KV.second);
+    }
+  }
+  return source;
+}
+
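+// Consume lines from 'remaining_output' until one matches TC. Every line read
+// along the way must not match any of the accumulated MR_Not regexes, and a
+// TC marked MR_Next must match the very first line read.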
+void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
+               TestCaseList const& not_checks) {
+  std::string first_line;
+  bool on_first = true;
+  std::string line;
+  while (remaining_output.eof() == false) {
+    CHECK(remaining_output.good());
+    std::getline(remaining_output, line);
+    if (on_first) {
+      first_line = line;
+      on_first = false;
+    }
+    for (const auto& NC : not_checks) {
+      CHECK(!NC.regex->Match(line))
+          << "Unexpected match for line \"" << line << "\" for MR_Not regex \""
+          << NC.regex_str << "\""
+          << "\n    actual regex string \"" << TC.substituted_regex << "\""
+          << "\n    started matching near: " << first_line;
+    }
+    if (TC.regex->Match(line)) return;
+    CHECK(TC.match_rule != MR_Next)
+        << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
+        << "\""
+        << "\n    actual regex string \"" << TC.substituted_regex << "\""
+        << "\n    started matching near: " << first_line;
+  }
+  CHECK(remaining_output.eof() == false)
+      << "End of output reached before match for regex \"" << TC.regex_str
+      << "\" was found"
+      << "\n    actual regex string \"" << TC.substituted_regex << "\""
+      << "\n    started matching near: " << first_line;
+}
+
+void CheckCases(TestCaseList const& checks, std::stringstream& output) {
+  std::vector<TestCase> not_checks;
+  for (size_t i = 0; i < checks.size(); ++i) {
+    const auto& TC = checks[i];
+    if (TC.match_rule == MR_Not) {
+      not_checks.push_back(TC);
+      continue;
+    }
+    CheckCase(output, TC, not_checks);
+    not_checks.clear();
+  }
+}
+
+class TestReporter : public benchmark::BenchmarkReporter {
+ public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps) {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    (void)first;
+    return last_ret;
+  }
+
+  void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_) rep->ReportRuns(report);
+  }
+  void Finalize() {
+    for (auto rep : reporters_) rep->Finalize();
+  }
+
+ private:
+  std::vector<benchmark::BenchmarkReporter *> reporters_;
+};
+}
+}  // end namespace internal
+
+// ========================================================================= //
+// -------------------------- Public API Definitions------------------------ //
+// ========================================================================= //
+
+TestCase::TestCase(std::string re, int rule)
+    : regex_str(std::move(re)),
+      match_rule(rule),
+      substituted_regex(internal::PerformSubstitutions(regex_str)),
+      regex(std::make_shared<benchmark::Regex>()) {
+  std::string err_str;
+  regex->Init(substituted_regex, &err_str);
+  CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
+                         << "\""
+                         << "\n    originally \"" << regex_str << "\""
+                         << "\n    got error: " << err_str;
+}
+
+int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
+  auto& L = internal::GetTestCaseList(ID);
+  L.insert(L.end(), il);
+  return 0;
+}
+
+int SetSubstitutions(
+    std::initializer_list<std::pair<std::string, std::string>> il) {
+  auto& subs = internal::GetSubstitutions();
+  for (auto KV : il) {
+    bool exists = false;
+    KV.second = internal::PerformSubstitutions(KV.second);
+    for (auto& EKV : subs) {
+      if (EKV.first == KV.first) {
+        EKV.second = std::move(KV.second);
+        exists = true;
+        break;
+      }
+    }
+    if (!exists) subs.push_back(std::move(KV));
+  }
+  return 0;
+}
+
+void RunOutputTests(int argc, char* argv[]) {
+  using internal::GetTestCaseList;
+  benchmark::Initialize(&argc, argv);
+  benchmark::ConsoleReporter CR(benchmark::ConsoleReporter::OO_None);
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    std::vector<TestCase>& error_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n, std::vector<TestCase>& out_tc,
+                 std::vector<TestCase>& err_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
+      reporter.SetOutputStream(&out_stream);
+      reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut),
+       GetTestCaseList(TC_ConsoleErr), CR},
+      {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr),
+       JR},
+      {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr),
+       CSVR},
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  internal::TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+    std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+    std::string banner(msg.size() - 1, '-');
+    std::cout << banner << msg << banner << "\n";
+
+    std::cerr << rep_test.err_stream.str();
+    std::cout << rep_test.out_stream.str();
+
+    internal::CheckCases(rep_test.error_cases, rep_test.err_stream);
+    internal::CheckCases(rep_test.output_cases, rep_test.out_stream);
+
+    std::cout << "\n";
+  }
+}
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
new file mode 100644
index 0000000..e9f8ea5
--- /dev/null
+++ b/test/register_benchmark_test.cc
@@ -0,0 +1,148 @@
+
+#undef NDEBUG
+#include <cassert>
+#include <vector>
+
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "benchmark/benchmark.h"
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    all_runs_.insert(all_runs_.end(), begin(report), end(report));
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  std::vector<Run> all_runs_;
+};
+
+struct TestCase {
+  std::string name;
+  const char* label;
+  // Note: not explicit as we rely on it being converted through ADD_CASES.
+  TestCase(const char* xname) : TestCase(xname, nullptr) {}
+  TestCase(const char* xname, const char* xlabel)
+      : name(xname), label(xlabel) {}
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+
+  void CheckRun(Run const& run) const {
+    CHECK(name == run.benchmark_name) << "expected " << name << " got "
+                                      << run.benchmark_name;
+    if (label) {
+      CHECK(run.report_label == label) << "expected " << label << " got "
+                                       << run.report_label;
+    } else {
+      CHECK(run.report_label == "");
+    }
+  }
+};
+
+std::vector<TestCase> ExpectedResults;
+
+int AddCases(std::initializer_list<TestCase> const& v) {
+  for (auto N : v) {
+    ExpectedResults.push_back(N);
+  }
+  return 0;
+}
+
+#define CONCAT(x, y) CONCAT2(x, y)
+#define CONCAT2(x, y) x##y
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__})
+
+}  // end namespace
+
+typedef benchmark::internal::Benchmark* ReturnVal;
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with no additional arguments
+//----------------------------------------------------------------------------//
+void BM_function(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_function);
+ReturnVal dummy = benchmark::RegisterBenchmark(
+    "BM_function_manual_registration", BM_function);
+ADD_CASES({"BM_function"}, {"BM_function_manual_registration"});
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with additional arguments
+// Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they
+//       reject the variadic pack expansion of lambda captures.
+//----------------------------------------------------------------------------//
+#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+
+void BM_extra_args(benchmark::State& st, const char* label) {
+  while (st.KeepRunning()) {
+  }
+  st.SetLabel(label);
+}
+int RegisterFromFunction() {
+  std::pair<const char*, const char*> cases[] = {
+      {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}};
+  for (auto const& c : cases)
+    benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second);
+  return 0;
+}
+int dummy2 = RegisterFromFunction();
+ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"});
+
+#endif  // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with different callable types
+//----------------------------------------------------------------------------//
+
+struct CustomFixture {
+  void operator()(benchmark::State& st) {
+    while (st.KeepRunning()) {
+    }
+  }
+};
+
+void TestRegistrationAtRuntime() {
+#ifdef BENCHMARK_HAS_CXX11
+  {
+    CustomFixture fx;
+    benchmark::RegisterBenchmark("custom_fixture", fx);
+    AddCases({"custom_fixture"});
+  }
+#endif
+#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+  {
+    int x = 42;
+    auto capturing_lam = [=](benchmark::State& st) {
+      while (st.KeepRunning()) {
+      }
+      st.SetLabel(std::to_string(x));
+    };
+    benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam);
+    AddCases({{"lambda_benchmark", "42"}});
+  }
+#endif
+}
+
+int main(int argc, char* argv[]) {
+  TestRegistrationAtRuntime();
+
+  benchmark::Initialize(&argc, argv);
+
+  TestReporter test_reporter;
+  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+
+  return 0;
+}
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index b3898ac..2e6d2b2 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -1,182 +1,162 @@
 
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
-#include "../src/re.h" // NOTE: re.h is for internal use only
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <sstream>
-#include <vector>
 #include <utility>
 
-namespace {
-
-// ========================================================================= //
-// -------------------------- Testing Case --------------------------------- //
-// ========================================================================= //
-
-enum MatchRules {
-  MR_Default, // Skip non-matching lines until a match is found.
-  MR_Next    // Match must occur on the next line.
-};
-
-struct TestCase {
-  std::string regex;
-  int match_rule;
-
-  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
-
-  void Check(std::stringstream& remaining_output) const {
-    benchmark::Regex r;
-    std::string err_str;
-    r.Init(regex, &err_str);
-    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
-                           << " got Error: " << err_str;
-
-    std::string line;
-    while (remaining_output.eof() == false) {
-        CHECK(remaining_output.good());
-        std::getline(remaining_output, line);
-        if (r.Match(line)) return;
-        CHECK(match_rule != MR_Next) << "Expected line \"" << line
-                                     << "\" to match regex \"" << regex << "\"";
-    }
-
-    CHECK(remaining_output.eof() == false)
-        << "End of output reached before match for regex \"" << regex
-        << "\" was found";
-  }
-};
-
-std::vector<TestCase> ConsoleOutputTests;
-std::vector<TestCase> JSONOutputTests;
-std::vector<TestCase> CSVOutputTests;
-
-std::vector<TestCase> ConsoleErrorTests;
-std::vector<TestCase> JSONErrorTests;
-std::vector<TestCase> CSVErrorTests;
-
-// ========================================================================= //
-// -------------------------- Test Helpers --------------------------------- //
-// ========================================================================= //
-
-class TestReporter : public benchmark::BenchmarkReporter {
-public:
-  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps)  {}
-
-  virtual bool ReportContext(const Context& context) {
-    bool last_ret = false;
-    bool first = true;
-    for (auto rep : reporters_) {
-      bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
-          << "Reports return different values for ReportContext";
-      first = false;
-      last_ret = new_ret;
-    }
-    return last_ret;
-  }
-
-  virtual void ReportRuns(const std::vector<Run>& report) {
-    for (auto rep : reporters_)
-      rep->ReportRuns(report);
-  }
-
-  virtual void Finalize() {
-      for (auto rep : reporters_)
-        rep->Finalize();
-  }
-
-private:
-  std::vector<benchmark::BenchmarkReporter*> reporters_;
-};
-
-
-#define CONCAT2(x, y) x##y
-#define CONCAT(x, y) CONCAT2(x, y)
-
-#define ADD_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
-
-int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
-  for (auto const& TC : v)
-    out->push_back(TC);
-  return 0;
-}
-
-template <class First>
-std::string join(First f) { return f; }
-
-template <class First, class ...Args>
-std::string join(First f, Args&&... args) {
-    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
-}
-
-std::string dec_re = "[0-9]+\\.[0-9]+";
-
-}  // end namespace
+#include "benchmark/benchmark.h"
+#include "output_test.h"
 
 // ========================================================================= //
 // ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
 
-ADD_CASES(&ConsoleOutputTests, {
-    {join("^Benchmark", "Time", "CPU", "Iterations$"), MR_Next},
-    {"^[-]+$", MR_Next}
-});
-ADD_CASES(&CSVOutputTests, {
-  {"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,"
-    "label,error_occurred,error_message"}
-});
+ADD_CASES(TC_ConsoleOut, {{"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
+                          {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
+            "items_per_second,label,error_occurred,error_message"}});
 
 // ========================================================================= //
 // ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
 void BM_basic(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_basic);
 
-ADD_CASES(&ConsoleOutputTests, {
-    {"^BM_basic[ ]+[0-9]{1,5} ns[ ]+[0-9]{1,5} ns[ ]+[0-9]+$"}
-});
-ADD_CASES(&JSONOutputTests, {
-    {"\"name\": \"BM_basic\",$"},
-    {"\"iterations\": [0-9]+,$", MR_Next},
-    {"\"real_time\": [0-9]{1,5},$", MR_Next},
-    {"\"cpu_time\": [0-9]{1,5},$", MR_Next},
-    {"\"time_unit\": \"ns\"$", MR_Next},
-    {"}", MR_Next}
-});
-ADD_CASES(&CSVOutputTests, {
-    {"^\"BM_basic\",[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,$"}
-});
+ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Bytes per Second Output ---------------- //
+// ========================================================================= //
+
+void BM_bytes_per_second(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetBytesProcessed(1);
+}
+BENCHMARK(BM_bytes_per_second);
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_bytes_per_second %console_report +%floatB/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bytes_per_second\": %int$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Items per Second Output ---------------- //
+// ========================================================================= //
+
+void BM_items_per_second(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetItemsProcessed(1);
+}
+BENCHMARK(BM_items_per_second);
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_items_per_second %console_report +%float items/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"items_per_second\": %int$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Label Output --------------------------- //
+// ========================================================================= //
+
+void BM_label(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetLabel("some label");
+}
+BENCHMARK(BM_label);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"label\": \"some label\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some "
+                       "label\"%csv_label_report_end$"}});
 
 // ========================================================================= //
 // ------------------------ Testing Error Output --------------------------- //
 // ========================================================================= //
 
 void BM_error(benchmark::State& state) {
-    state.SkipWithError("message");
-    while(state.KeepRunning()) {}
+  state.SkipWithError("message");
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK(BM_error);
-ADD_CASES(&ConsoleOutputTests, {
-    {"^BM_error[ ]+ERROR OCCURRED: 'message'$"}
-});
-ADD_CASES(&JSONOutputTests, {
-    {"\"name\": \"BM_error\",$"},
-    {"\"error_occurred\": true,$", MR_Next},
-    {"\"error_message\": \"message\",$", MR_Next}
-});
+ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"error_occurred\": true,$", MR_Next},
+                       {"\"error_message\": \"message\",$", MR_Next}});
 
-ADD_CASES(&CSVOutputTests, {
-    {"^\"BM_error\",,,,,,,,true,\"message\"$"}
-});
+ADD_CASES(TC_CSVOut, {{"^\"BM_error\",,,,,,,,true,\"message\"$"}});
 
+// ========================================================================= //
+// ----------------------- Testing No Arg Name Output ---------------------- //
+// ========================================================================= //
+
+void BM_no_arg_name(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_no_arg_name)->Arg(3);
+ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Arg Name Output ----------------------- //
+// ========================================================================= //
+
+void BM_arg_name(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
+ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Arg Names Output ----------------------- //
+// ========================================================================= //
+
+void BM_arg_names(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
 // ========================================================================= //
 // ----------------------- Testing Complexity Output ----------------------- //
@@ -185,75 +165,92 @@
 void BM_Complexity_O1(benchmark::State& state) {
   while (state.KeepRunning()) {
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1)->Range(1, 1<<18)->Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
+SET_SUBSTITUTIONS({{"%bigOStr", "[ ]* %float \\([0-9]+\\)"},
+                   {"%RMS", "[ ]*[0-9]+ %"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Complexity_O1_BigO %bigOStr %bigOStr[ ]*$"},
+                          {"^BM_Complexity_O1_RMS %RMS %RMS[ ]*$"}});
 
-std::string bigOStr = "[0-9]+\\.[0-9]+ \\([0-9]+\\)";
+// ========================================================================= //
+// ----------------------- Testing Aggregate Output ------------------------ //
+// ========================================================================= //
 
-ADD_CASES(&ConsoleOutputTests, {
-   {join("^BM_Complexity_O1_BigO", bigOStr, bigOStr) + "[ ]*$"},
-   {join("^BM_Complexity_O1_RMS", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
-});
+// Test that non-aggregate data is printed by default
+void BM_Repeat(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_Repeat)->Repetitions(3);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"},
+                          {"^BM_Repeat/repeats:3 %console_report$"},
+                          {"^BM_Repeat/repeats:3 %console_report$"},
+                          {"^BM_Repeat/repeats:3_mean %console_report$"},
+                          {"^BM_Repeat/repeats:3_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}});
 
+// Test that a non-repeated test still prints non-aggregate results even when
+// only-aggregate reports have been requested
+void BM_RepeatOnce(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
+ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}});
+
+// Test that non-aggregate data is not reported
+void BM_SummaryRepeat(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+ADD_CASES(TC_ConsoleOut,
+          {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+           {"^BM_SummaryRepeat/repeats:3_mean %console_report$"},
+           {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+                       {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+                       {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}});
+ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+                      {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}});
+
+void BM_RepeatTimeUnit(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_RepeatTimeUnit)
+    ->Repetitions(3)
+    ->ReportAggregatesOnly()
+    ->Unit(benchmark::kMicrosecond);
+ADD_CASES(TC_ConsoleOut,
+          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+           {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"},
+           {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}});
+ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+                       {"\"time_unit\": \"us\",?$"},
+                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+                       {"\"time_unit\": \"us\",?$"}});
+ADD_CASES(TC_CSVOut,
+          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+           {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"},
+           {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}});
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
-
-int main(int argc, char* argv[]) {
-  // Add --color_print=false to argv since we don't want to match color codes.
-  char new_arg[64];
-  char* new_argv[64];
-  std::copy(argv, argv + argc, new_argv);
-  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
-  benchmark::Initialize(&argc, new_argv);
-
-  benchmark::ConsoleReporter CR;
-  benchmark::JSONReporter JR;
-  benchmark::CSVReporter CSVR;
-  struct ReporterTest {
-    const char* name;
-    std::vector<TestCase>& output_cases;
-    std::vector<TestCase>& error_cases;
-    benchmark::BenchmarkReporter& reporter;
-    std::stringstream out_stream;
-    std::stringstream err_stream;
-
-    ReporterTest(const char* n,
-                 std::vector<TestCase>& out_tc,
-                 std::vector<TestCase>& err_tc,
-                 benchmark::BenchmarkReporter& br)
-        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
-        reporter.SetOutputStream(&out_stream);
-        reporter.SetErrorStream(&err_stream);
-    }
-  } TestCases[] = {
-      {"ConsoleReporter", ConsoleOutputTests, ConsoleErrorTests, CR},
-      {"JSONReporter", JSONOutputTests, JSONErrorTests, JR},
-      {"CSVReporter", CSVOutputTests, CSVErrorTests, CSVR}
-  };
-
-  // Create the test reporter and run the benchmarks.
-  std::cout << "Running benchmarks...\n";
-  TestReporter test_rep({&CR, &JR, &CSVR});
-  benchmark::RunSpecifiedBenchmarks(&test_rep);
-
-  for (auto& rep_test : TestCases) {
-      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
-      std::string banner(msg.size() - 1, '-');
-      std::cout << banner << msg << banner << "\n";
-
-      std::cerr << rep_test.err_stream.str();
-      std::cout << rep_test.out_stream.str();
-
-      for (const auto& TC : rep_test.error_cases)
-        TC.Check(rep_test.err_stream);
-      for (const auto& TC : rep_test.output_cases)
-        TC.Check(rep_test.out_stream);
-
-      std::cout << "\n";
-  }
-  return 0;
-}
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index dafbd64..b74d33c 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -1,10 +1,11 @@
 
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
 #include <cassert>
 #include <vector>
 
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "benchmark/benchmark.h"
+
 namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
@@ -18,7 +19,7 @@
     ConsoleReporter::ReportRuns(report);
   }
 
-  TestReporter()  {}
+  TestReporter() {}
   virtual ~TestReporter() {}
 
   mutable std::vector<Run> all_runs_;
@@ -32,11 +33,12 @@
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name) << "expected " << name << " got " << run.benchmark_name;
+    CHECK(name == run.benchmark_name) << "expected " << name << " got "
+                                      << run.benchmark_name;
     CHECK(error_occurred == run.error_occurred);
     CHECK(error_message == run.error_message);
     if (error_occurred) {
-      //CHECK(run.iterations == 0);
+      // CHECK(run.iterations == 0);
     } else {
       CHECK(run.iterations != 0);
     }
@@ -55,12 +57,10 @@
 
 #define CONCAT(x, y) CONCAT2(x, y)
 #define CONCAT2(x, y) x##y
-#define ADD_CASES(...) \
-int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
 
 }  // end namespace
 
-
 void BM_error_before_running(benchmark::State& state) {
   state.SkipWithError("error message");
   while (state.KeepRunning()) {
@@ -68,13 +68,12 @@
   }
 }
 BENCHMARK(BM_error_before_running);
-ADD_CASES("BM_error_before_running",
-          {{"", true, "error message"}});
+ADD_CASES("BM_error_before_running", {{"", true, "error message"}});
 
 void BM_error_during_running(benchmark::State& state) {
   int first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
       assert(first_iter);
       first_iter = false;
       state.SkipWithError("error message");
@@ -85,17 +84,14 @@
   }
 }
 BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_during_running",
-    {{"/1/threads:1", true, "error message"},
-    {"/1/threads:2", true, "error message"},
-    {"/1/threads:4", true, "error message"},
-    {"/1/threads:8", true, "error message"},
-    {"/2/threads:1", false, ""},
-    {"/2/threads:2", false, ""},
-    {"/2/threads:4", false, ""},
-    {"/2/threads:8", false, ""}}
-);
+ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"},
+                                      {"/1/threads:2", true, "error message"},
+                                      {"/1/threads:4", true, "error message"},
+                                      {"/1/threads:8", true, "error message"},
+                                      {"/2/threads:1", false, ""},
+                                      {"/2/threads:2", false, ""},
+                                      {"/2/threads:4", false, ""},
+                                      {"/2/threads:8", false, ""}});
 
 void BM_error_after_running(benchmark::State& state) {
   while (state.KeepRunning()) {
@@ -105,18 +101,15 @@
     state.SkipWithError("error message");
 }
 BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_after_running",
-    {{"/threads:1", true, "error message"},
-    {"/threads:2", true, "error message"},
-    {"/threads:4", true, "error message"},
-    {"/threads:8", true, "error message"}}
-);
+ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"},
+                                     {"/threads:2", true, "error message"},
+                                     {"/threads:4", true, "error message"},
+                                     {"/threads:8", true, "error message"}});
 
 void BM_error_while_paused(benchmark::State& state) {
   bool first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
       assert(first_iter);
       first_iter = false;
       state.PauseTiming();
@@ -128,18 +121,14 @@
   }
 }
 BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_while_paused",
-    {{"/1/threads:1", true, "error message"},
-    {"/1/threads:2", true, "error message"},
-    {"/1/threads:4", true, "error message"},
-    {"/1/threads:8", true, "error message"},
-    {"/2/threads:1", false, ""},
-    {"/2/threads:2", false, ""},
-    {"/2/threads:4", false, ""},
-    {"/2/threads:8", false, ""}}
-);
-
+ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"},
+                                    {"/1/threads:2", true, "error message"},
+                                    {"/1/threads:4", true, "error message"},
+                                    {"/1/threads:8", true, "error message"},
+                                    {"/2/threads:1", false, ""},
+                                    {"/2/threads:2", false, ""},
+                                    {"/2/threads:4", false, ""},
+                                    {"/2/threads:8", false, ""}});
 
 int main(int argc, char* argv[]) {
   benchmark::Initialize(&argc, argv);
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
new file mode 100755
index 0000000..ed0f133
--- /dev/null
+++ b/tools/compare_bench.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+"""
+compare_bench.py - Compare two benchmarks or their results and report the
+                   difference.
+"""
+import sys
+import gbench
+from gbench import util, report
+
+def main():
+    # Parse the command line flags
+    def usage():
+        print('compare_bench.py <test1> <test2> [benchmark options]...')
+        sys.exit(1)
+    if '--help' in sys.argv or len(sys.argv) < 3:
+        usage()
+    tests = sys.argv[1:3]
+    bench_opts = list(sys.argv[3:])
+    # Run the benchmarks and report the results
+    json1 = gbench.util.run_or_load_benchmark(tests[0], bench_opts)
+    json2 = gbench.util.run_or_load_benchmark(tests[1], bench_opts)
+    output_lines = gbench.report.generate_difference_report(json1, json2)
+    print('Comparing %s to %s' % (tests[0], tests[1]))
+    for ln in output_lines:
+        print(ln)
+
+
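+# Typical invocation, sketched with hypothetical paths; any arguments after
+# the two test files are forwarded to the benchmark binaries:
+#
+#   ./compare_bench.py ./old/mybench ./new/mybench --benchmark_filter=BM_.*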
+if __name__ == '__main__':
+    main()
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
new file mode 100644
index 0000000..da9425e
--- /dev/null
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -0,0 +1,46 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
new file mode 100644
index 0000000..d8bc72d
--- /dev/null
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -0,0 +1,46 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 25,
+      "cpu_time": 25,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 20833333,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 90,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tools/gbench/__init__.py b/tools/gbench/__init__.py
new file mode 100644
index 0000000..fce1a1a
--- /dev/null
+++ b/tools/gbench/__init__.py
@@ -0,0 +1,8 @@
+"""Google Benchmark tooling"""
+
+__author__ = 'Eric Fiselier'
+__email__ = 'eric@efcs.ca'
+__versioninfo__ = (0, 5, 0)
+__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
+
+__all__ = []
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
new file mode 100644
index 0000000..ac69b9b
--- /dev/null
+++ b/tools/gbench/report.py
@@ -0,0 +1,156 @@
+"""report.py - Utilities for reporting statistics about benchmark results
+"""
+import os
+
+class BenchmarkColor(object):
+    def __init__(self, name, code):
+        self.name = name
+        self.code = code
+
+    def __repr__(self):
+        return '%s%r' % (self.__class__.__name__,
+                         (self.name, self.code))
+
+    def __format__(self, format_spec):
+        return self.code
+
+# Benchmark Colors Enumeration
+BC_NONE = BenchmarkColor('NONE', '')
+BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
+BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
+BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
+BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
+BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
+BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
+BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
+BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
+BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
+
+def color_format(use_color, fmt_str, *args, **kwargs):
+    """
+    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
+    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
+    is False then all color codes in 'args' and 'kwargs' are replaced with
+    the empty string.
+    """
+    assert use_color is True or use_color is False
+    if not use_color:
+        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
+                for arg in args]
+        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
+                  for key, arg in kwargs.items()}
+    return fmt_str.format(*args, **kwargs)
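+# Illustrative use (hypothetical call, not part of the public API): with
+# use_color=False the color arguments collapse to empty strings, so
+#   color_format(False, '{}{}{endc}', BC_FAIL, 'slow', endc=BC_ENDC)
+# returns plain 'slow'; with use_color=True the ANSI escapes are kept.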
+
+
+def find_longest_name(benchmark_list):
+    """
+    Return the length of the longest benchmark name in a given list of
+    benchmark JSON objects.
+    """
+    longest_name = 1
+    for bc in benchmark_list:
+        if len(bc['name']) > longest_name:
+            longest_name = len(bc['name'])
+    return longest_name
+
+
+def calculate_change(old_val, new_val):
+    """
+    Return a float representing the fractional change from old_val to new_val.
+    """
+    if old_val == 0 and new_val == 0:
+        return 0.0
+    if old_val == 0:
+        return float(new_val - old_val) / (float(old_val + new_val) / 2)
+    return float(new_val - old_val) / abs(old_val)
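+# Worked examples (values borrowed from the Inputs/test1_*.json files above):
+#   calculate_change(50, 25)  == -0.5    # 2x faster
+#   calculate_change(50, 100) == +1.0    # 2x slower
+#   calculate_change(100, 90) == -0.1    # 10 percent faster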
+
+
+def generate_difference_report(json1, json2, use_color=True):
+    """
+    Calculate and report the difference between each test of two benchmark
+    runs specified as 'json1' and 'json2'.
+    """
+    first_col_width = find_longest_name(json1['benchmarks']) + 5
+    def find_test(name):
+        for b in json2['benchmarks']:
+            if b['name'] == name:
+                return b
+        return None
+    first_line = "{:<{}s}     Time           CPU           Old           New".format(
+        'Benchmark', first_col_width)
+    output_strs = [first_line, '-' * len(first_line)]
+    for bn in json1['benchmarks']:
+        other_bench = find_test(bn['name'])
+        if not other_bench:
+            continue
+
+        def get_color(res):
+            if res > 0.05:
+                return BC_FAIL
+            elif res > -0.07:
+                return BC_WHITE
+            else:
+                return BC_CYAN
+        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}         {:4d}         {:4d}"
+        tres = calculate_change(bn['real_time'], other_bench['real_time'])
+        cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+        output_strs += [color_format(use_color, fmt_str,
+            BC_HEADER, bn['name'], first_col_width,
+            get_color(tres), tres, get_color(cpures), cpures,
+            bn['cpu_time'], other_bench['cpu_time'],
+            endc=BC_ENDC)]
+    return output_strs
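+# With the Inputs/test1_*.json files above, the report contains rows such as
+# (whitespace compressed; the last two columns are the old and new cpu_time):
+#   BM_2xFaster   -0.50   -0.50   50   25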
+
+###############################################################################
+# Unit tests
+
+import unittest
+
+class TestReportDifference(unittest.TestCase):
+    def load_results(self):
+        import json
+        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
+        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
+        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
+        with open(testOutput1, 'r') as f:
+            json1 = json.load(f)
+        with open(testOutput2, 'r') as f:
+            json2 = json.load(f)
+        return json1, json2
+
+    def test_basic(self):
+        expect_lines = [
+            ['BM_SameTimes', '+0.00', '+0.00'],
+            ['BM_2xFaster', '-0.50', '-0.50'],
+            ['BM_2xSlower', '+1.00', '+1.00'],
+            ['BM_10PercentFaster', '-0.10', '-0.10'],
+            ['BM_10PercentSlower', '+0.10', '+0.10']
+        ]
+        json1, json2 = self.load_results()
+        output_lines_with_header = generate_difference_report(
+            json1, json2, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print(output_lines)
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 5)
+            self.assertEqual(parts[:3], expect_lines[i])
+
+
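+# Running this file directly (e.g. 'python tools/gbench/report.py' from the
+# repository root) executes the unit tests above.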
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
new file mode 100644
index 0000000..169b71c
--- /dev/null
+++ b/tools/gbench/util.py
@@ -0,0 +1,141 @@
+"""util.py - General utilities for running, loading, and processing benchmarks
+"""
+import json
+import os
+import tempfile
+import subprocess
+import sys
+
+# Input file type enumeration
+IT_Invalid    = 0
+IT_JSON       = 1
+IT_Executable = 2
+
+_num_magic_bytes = 2 if sys.platform.startswith('win') else 4
+def is_executable_file(filename):
+    """
+    Return 'True' if 'filename' names a valid file which is likely
+    an executable. A file is considered an executable if it starts with the
+    magic bytes for a EXE, Mach O, or ELF file.
+    """
+    if not os.path.isfile(filename):
+        return False
+    with open(filename, 'rb') as f:
+        magic_bytes = f.read(_num_magic_bytes)
+    if sys.platform == 'darwin':
+        return magic_bytes in [
+            '\xfe\xed\xfa\xce',  # MH_MAGIC
+            '\xce\xfa\xed\xfe',  # MH_CIGAM
+            '\xfe\xed\xfa\xcf',  # MH_MAGIC_64
+            '\xcf\xfa\xed\xfe',  # MH_CIGAM_64
+            '\xca\xfe\xba\xbe',  # FAT_MAGIC
+            '\xbe\xba\xfe\xca'   # FAT_CIGAM
+        ]
+    elif sys.platform.startswith('win'):
+        return magic_bytes == 'MZ'
+    else:
+        return magic_bytes == '\x7FELF'
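+# For example, on Linux a native binary such as /bin/ls starts with the ELF
+# magic bytes, so is_executable_file would return True for it, while a JSON
+# results file would not (the path is illustrative).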
+
+
+def is_json_file(filename):
+    """
+    Return 'True' if 'filename' names a valid JSON output file, and
+    'False' otherwise.
+    """
+    try:
+        with open(filename, 'r') as f:
+            json.load(f)
+        return True
+    except (ValueError, IOError):
+        pass
+    return False
+
+
+def classify_input_file(filename):
+    """
+    Return a tuple (type, msg) where 'type' specifies the classified type
+    of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human-readable
+    string representing the error.
+    """
+    ftype = IT_Invalid
+    err_msg = None
+    if not os.path.exists(filename):
+        err_msg = "'%s' does not exist" % filename
+    elif not os.path.isfile(filename):
+        err_msg = "'%s' does not name a file" % filename
+    elif is_executable_file(filename):
+        ftype = IT_Executable
+    elif is_json_file(filename):
+        ftype = IT_JSON
+    else:
+        err_msg = "'%s' does not name a valid benchmark executable or JSON file"
+    return ftype, err_msg
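+# Rough sketch of the contract (the filenames are hypothetical):
+#   classify_input_file('./mybench')      -> (IT_Executable, None)
+#   classify_input_file('./results.json') -> (IT_JSON, None)
+#   classify_input_file('./missing')      -> (IT_Invalid, "'./missing' does not exist")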
+
+
+def check_input_file(filename):
+    """
+    Classify the file named by 'filename' and return the classification.
+    If the file is classified as 'IT_Invalid' print an error message and exit
+    the program.
+    """
+    ftype, msg = classify_input_file(filename)
+    if ftype == IT_Invalid:
+        print "Invalid input file: %s" % msg
+        sys.exit(1)
+    return ftype
+
+
+def load_benchmark_results(fname):
+    """
+    Read benchmark output from a file and return the JSON object.
+    REQUIRES: 'fname' names a file containing JSON benchmark output.
+    """
+    with open(fname, 'r') as f:
+        return json.load(f)
+
+
+def run_benchmark(exe_name, benchmark_flags):
+    """
+    Run a benchmark specified by 'exe_name' with the specified
+    'benchmark_flags'. The benchmark is run directly as a subprocess to preserve
+    real-time console output.
+    RETURNS: A JSON object representing the benchmark output
+    """
+    thandle, tname = tempfile.mkstemp()
+    os.close(thandle)
+    cmd = [exe_name] + benchmark_flags
+    print("RUNNING: %s" % ' '.join(cmd))
+    exitCode = subprocess.call(cmd + ['--benchmark_out=%s' % tname])
+    if exitCode != 0:
+        print('TEST FAILED...')
+        sys.exit(exitCode)
+    json_res = load_benchmark_results(tname)
+    os.unlink(tname)
+    return json_res
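+# Sketch of typical use (the executable path and flag are hypothetical):
+#   run_benchmark('./mybench', ['--benchmark_filter=BM_.*'])
+# appends --benchmark_out=<tempfile> to the command, runs it, and returns
+# the parsed JSON results.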
+
+
+def run_or_load_benchmark(filename, benchmark_flags):
+    """
+    Get the results for a specified benchmark. If 'filename' specifies
+    an executable benchmark then the results are generated by running the
+    benchmark. Otherwise 'filename' must name a valid JSON output file,
+    which is loaded and returned.
+    """
+    ftype = check_input_file(filename)
+    if ftype == IT_JSON:
+        return load_benchmark_results(filename)
+    elif ftype == IT_Executable:
+        return run_benchmark(filename, benchmark_flags)
+    else:
+    assert False  # This branch is unreachable
\ No newline at end of file