diff --git a/.gitignore b/.gitignore
index 3c1b4f2..bfc6b57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,7 @@
 # out-of-source build top-level folders.
 build/
 _build/
+
+# in-source dependencies
+/googletest/
+
diff --git a/.travis.yml b/.travis.yml
index 36df088..58be73f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -129,6 +129,11 @@
       compiler: clang
       env:
         - COMPILER=clang++ BUILD_TYPE=Release
+    - os: osx
+      osx_image: xcode8.3
+      compiler: gcc
+      env:
+        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
 
 before_script:
   - if [ -z "$BUILD_32_BITS" ]; then
@@ -137,7 +142,7 @@
   - if [ -n "${LIBCXX_BUILD}" ]; then
       source .travis-libcxx-setup.sh;
     fi
-  - mkdir build && cd build
+  - mkdir -p build && cd build
 
 install:
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
@@ -145,9 +150,14 @@
       pip install --user --upgrade pip;
       pip install --user cpp-coveralls;
     fi
+  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
+      rm -f /usr/local/include/c++;
+      brew update;
+      brew install gcc@7;
+    fi
 
 script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ..
+  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ..
   - make
   - ctest -C ${BUILD_TYPE} --output-on-failure
 
diff --git a/AUTHORS b/AUTHORS
index ae278df..4e4c4ed 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -10,9 +10,11 @@
 
 Albert Pretorius <pretoalb@gmail.com>
 Arne Beer <arne@twobeer.de>
+Carto
 Christopher Seymour <chris.j.seymour@hotmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
-Dominic Hamon <dma@stripysock.com>
+Dirac Research 
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
@@ -21,10 +23,11 @@
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
-Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
@@ -32,9 +35,9 @@
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Roman Lebedev <lebedev.ri@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steinar H. Gunderson <sgunderson@bigfoot.com>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
-Dirac Research 
 Zbigniew Skowron <zbychs@gmail.com>
-Dominik Czarnota <dominik.b.czarnota@gmail.com>
diff --git a/Android.bp b/Android.bp
index 8301356..4478fdf 100644
--- a/Android.bp
+++ b/Android.bp
@@ -41,6 +41,7 @@
         "src/json_reporter.cc",
         "src/reporter.cc",
         "src/sleep.cc",
+        "src/statistics.cc",
         "src/string_util.cc",
         "src/sysinfo.cc",
         "src/timers.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7f1566..aa08267 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@
 foreach(p
     CMP0054 # CMake 3.1
     CMP0056 # export EXE_LINKER_FLAGS to try_run
+    CMP0057 # Support the if() IN_LIST operator (CMake 3.3)
     )
   if(POLICY ${p})
     cmake_policy(SET ${p} NEW)
@@ -15,9 +16,19 @@
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF)
+option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+
+# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
+# may require downloading the source code.
+option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree building of unmet dependencies" OFF)
+
+# This option can be used to disable building and running unit tests which depend on gtest
+# in cases where it is not possible to build or find a valid version of gtest.
+option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
 
 # Make sure we can import out CMake functions
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
 # Read the git tags to determine the project version
@@ -95,9 +106,7 @@
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-fno-exceptions)
   endif()
-  if (NOT BENCHMARK_USE_LIBCXX)
-    add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
-  endif()
+
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
     if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
       add_cxx_compiler_flag(-Wstrict-aliasing)
@@ -131,28 +140,27 @@
       if (GCC_RANLIB)
         set(CMAKE_RANLIB ${GCC_RANLIB})
       endif()
+    elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+      include(llvm-toolchain)
     endif()
   endif()
 
   # Coverage build type
-  set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING
-    "Flags used by the C++ compiler during coverage builds."
+  set(BENCHMARK_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the C++ compiler during coverage builds."
     FORCE)
-  set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING
-    "Flags used for linking binaries during coverage builds."
+  set(BENCHMARK_EXE_LINKER_FLAGS_COVERAGE "${CMAKE_EXE_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used for linking binaries during coverage builds."
     FORCE)
-  set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
-    "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING
-    "Flags used by the shared libraries linker during coverage builds."
+  set(BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
     FORCE)
   mark_as_advanced(
-    CMAKE_CXX_FLAGS_COVERAGE
-    CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
+    BENCHMARK_CXX_FLAGS_COVERAGE
+    BENCHMARK_EXE_LINKER_FLAGS_COVERAGE
+    BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE)
   set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
-    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage."
-    FORCE)
+    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.")
   add_cxx_compiler_flag(--coverage COVERAGE)
 endif()
 
@@ -198,5 +206,8 @@
 
 if (BENCHMARK_ENABLE_TESTING)
   enable_testing()
+  if (BENCHMARK_ENABLE_GTEST_TESTS)
+    include(HandleGTest)
+  endif()
   add_subdirectory(test)
 endif()
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 9abb608..c59134b 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -28,18 +28,20 @@
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
 Christopher Seymour <chris.j.seymour@hotmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
-Dominic Hamon <dma@stripysock.com>
+Dominic Hamon <dma@stripysock.com> <dominic@google.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
-Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
-Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
+Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
@@ -49,11 +51,12 @@
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
+Roman Lebedev <lebedev.ri@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
-Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Zbigniew Skowron <zbychs@gmail.com>
-Dominik Czarnota <dominik.b.czarnota@gmail.com>
diff --git a/README.md b/README.md
index 2430d93..45e27ed 100644
--- a/README.md
+++ b/README.md
@@ -13,13 +13,85 @@
 
 [Additional Tooling Documentation](docs/tools.md)
 
+
+## Building
+
+The basic steps for configuring and building the library look like this:
+
+```bash
+$ git clone https://github.com/google/benchmark.git
+# Benchmark requires GTest as a dependency. Add the source tree as a subdirectory.
+$ git clone https://github.com/google/googletest.git benchmark/googletest
+$ mkdir build && cd build
+$ cmake -G <generator> [options] ../benchmark
+# Assuming a makefile generator was used
+$ make
+```
+
+Note that Google Benchmark requires GTest to build and run the tests. This
+dependency can be provided three ways:
+
+* Checkout the GTest sources into `benchmark/googletest`.
+* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
+  configuration, the library will automatically download and build any required
+  dependencies.
+* Otherwise, if nothing is done, CMake will use `find_package(GTest REQUIRED)`
+  to resolve the required GTest dependency.
+
+
+## Installation Guide
+
+For Ubuntu and Debian based systems:
+
+First, make sure you have git and cmake installed (if not, please install them):
+
+```
+sudo apt-get install git
+sudo apt-get install cmake
+```
+
+Now, let's clone the repository and build it
+
+```
+git clone https://github.com/google/benchmark.git
+cd benchmark
+mkdir build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=RELEASE
+make
+```
+
+We need to install the library globally now
+
+```
+sudo make install
+```
+
+Now you have google/benchmark installed on your machine.
+Note: Don't forget to link against the pthread library when building.
+
+## Stable and Experimental Library Versions
+
+The main branch contains the latest stable version of the benchmarking library;
+the API of which can be considered largely stable, with source breaking changes
+being made only upon the release of a new major version.
+
+Newer, experimental, features are implemented and tested on the
+[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
+to use, test, and provide feedback on the new features are encouraged to try
+this branch. However, this branch provides no stability guarantees and reserves
+the right to change and break the API at any time.
+
+
 ## Example usage
 ### Basic usage
 Define a function that executes the code to be measured.
 
 ```c++
+#include <benchmark/benchmark.h>
+
 static void BM_StringCreation(benchmark::State& state) {
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string empty_string;
 }
 // Register the function as a benchmark
@@ -28,7 +100,7 @@
 // Define another benchmark
 static void BM_StringCopy(benchmark::State& state) {
   std::string x = "hello";
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string copy(x);
 }
 BENCHMARK(BM_StringCopy);
@@ -36,6 +108,10 @@
 BENCHMARK_MAIN();
 ```
 
+Don't forget to tell your linker to link against the benchmark library, e.g. with the `-lbenchmark` flag.
+
+The benchmark library will report the timing for the code within the `for(...)` loop.
+
 ### Passing arguments
 Sometimes a family of benchmarks can be implemented with just one routine that
 takes an extra argument to specify which one of the family of benchmarks to
@@ -47,7 +123,7 @@
   char* src = new char[state.range(0)];
   char* dst = new char[state.range(0)];
   memset(src, 'x', state.range(0));
-  while (state.KeepRunning())
+  for (auto _ : state)
     memcpy(dst, src, state.range(0));
   state.SetBytesProcessed(int64_t(state.iterations()) *
                           int64_t(state.range(0)));
@@ -80,22 +156,23 @@
 
 ```c++
 static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  std::set<int> data;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
+    data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
     for (int j = 0; j < state.range(1); ++j)
       data.insert(RandomNumber());
   }
 }
 BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 1})
-    ->Args({1<<10, 8})
-    ->Args({1<<10, 64})
+    ->Args({1<<10, 128})
+    ->Args({2<<10, 128})
+    ->Args({4<<10, 128})
+    ->Args({8<<10, 128})
     ->Args({1<<10, 512})
-    ->Args({8<<10, 1})
-    ->Args({8<<10, 8})
-    ->Args({8<<10, 64})
+    ->Args({2<<10, 512})
+    ->Args({4<<10, 512})
     ->Args({8<<10, 512});
 ```
 
@@ -105,7 +182,7 @@
 pair.
 
 ```c++
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
 
 For more complex patterns of inputs, passing a custom function to `Apply` allows
@@ -131,7 +208,7 @@
 static void BM_StringCompare(benchmark::State& state) {
   std::string s1(state.range(0), '-');
   std::string s2(state.range(0), '-');
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(s1.compare(s2));
   }
   state.SetComplexityN(state.range(0));
@@ -165,7 +242,7 @@
 template <class Q> int BM_Sequential(benchmark::State& state) {
   Q q;
   typename Q::value_type v;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = state.range(0); i--; )
       q.push(v);
     for (int e = state.range(0); e--; )
@@ -181,7 +258,7 @@
 Three macros are provided for adding benchmark templates.
 
 ```c++
-#if __cplusplus >= 201103L // C++11 and greater.
+#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
 #else // C++ < C++11
 #define BENCHMARK_TEMPLATE(func, arg1)
@@ -190,6 +267,62 @@
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
 
+### A Faster KeepRunning loop
+
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks. For example:
+
+```c++
+static void BM_Fast(benchmark::State &state) {
+  for (auto _ : state) {
+    FastOperation();
+  }
+}
+BENCHMARK(BM_Fast);
+```
+
+The ranged-for loop is faster than `KeepRunning` because `KeepRunning`
+requires a memory load and store of the iteration count on every
+iteration, whereas the ranged-for variant is able to keep the iteration count
+in a register.
+
+For example, an empty inner loop using the range-based for method looks like:
+
+```asm
+# Loop Init
+  mov rbx, qword ptr [r14 + 104]
+  call benchmark::State::StartKeepRunning()
+  test rbx, rbx
+  je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+  add rbx, -1
+  jne .LoopHeader
+.LoopEnd:
+```
+
+Compared to an empty `KeepRunning` loop, which looks like:
+
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+  cmp byte ptr [rbx], 1
+  jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+  mov rax, qword ptr [rbx + 8]
+  lea rcx, [rax + 1]
+  mov qword ptr [rbx + 8], rcx
+  cmp rax, qword ptr [rbx + 104]
+  jb .LoopHeader
+  jmp .LoopEnd
+.LoopInit:
+  mov rdi, rbx
+  call benchmark::State::StartKeepRunning()
+  jmp .LoopBody
+.LoopEnd:
+```
+
+Unless C++03 compatibility is required, the ranged-for variant of writing
+the benchmark loop should be preferred.  
+
 ## Passing arbitrary arguments to a benchmark
 In C++11 it is possible to define a benchmark that takes an arbitrary number
 of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
@@ -199,11 +332,11 @@
 should describe the values passed.
 
 ```c++
-template <class ...ExtraArgs>`
+template <class ...ExtraArgs>
 void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
   [...]
 }
-// Registers a benchmark named "BM_takes_args/int_string_test` that passes
+// Registers a benchmark named "BM_takes_args/int_string_test" that passes
 // the specified values to `extra_args`.
 BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
 ```
@@ -223,8 +356,7 @@
 benchmark tests to be registered programmatically.
 
 Additionally `RegisterBenchmark` allows any callable object to be registered
-as a benchmark. Including capturing lambdas and function objects. This
-allows the creation
+as a benchmark, including capturing lambdas and function objects.
 
 For Example:
 ```c++
@@ -240,9 +372,10 @@
 
 ### Multithreaded benchmarks
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
-it is guaranteed that none of the threads will start until all have called
-`KeepRunning`, and all will have finished before KeepRunning returns false. As
-such, any global setup or teardown can be wrapped in a check against the thread
+it is guaranteed that none of the threads will start until all have reached
+the start of the benchmark loop, and all will have finished before any thread
+exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
+API.) As such, any global setup or teardown can be wrapped in a check against the thread
 index:
 
 ```c++
@@ -250,7 +383,7 @@
   if (state.thread_index == 0) {
     // Setup code here.
   }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     // Run the test as normal.
   }
   if (state.thread_index == 0) {
@@ -274,10 +407,10 @@
 ## Manual timing
 For benchmarking something for which neither CPU time nor real-time are
 correct or accurate enough, completely manual timing is supported using
-the `UseManualTime` function. 
+the `UseManualTime` function.
 
 When `UseManualTime` is used, the benchmarked code must call
-`SetIterationTime` once per iteration of the `KeepRunning` loop to
+`SetIterationTime` once per iteration of the benchmark loop to
 report the manually measured time.
 
 An example use case for this is benchmarking GPU execution (e.g. OpenCL
@@ -293,7 +426,7 @@
     static_cast<double>(microseconds)
   };
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     auto start = std::chrono::high_resolution_clock::now();
     // Simulate some useful workload with a sleep
     std::this_thread::sleep_for(sleep_duration);
@@ -316,7 +449,7 @@
 
 ```c++
 static void BM_test(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
       int x = 0;
       for (int i=0; i < 64; ++i) {
         benchmark::DoNotOptimize(x += i);
@@ -355,7 +488,7 @@
 
 ```c++
 static void BM_vector_push_back(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::vector<int> v;
     v.reserve(1);
     benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
@@ -384,7 +517,7 @@
 set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on
 the registered benchmark object.
 
-## Reporting the mean and standard devation by repeated benchmarks
+## Reporting the mean, median and standard deviation by repeated benchmarks
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
 of the overall behavior. For this reason it's possible to repeatedly rerun the
@@ -392,19 +525,42 @@
 
 The number of runs of each benchmark is specified globally by the
 `--benchmark_repetitions` flag or on a per benchmark basis by calling
-`Repetitions` on the registered benchmark object. When a benchmark is run
-more than once the mean and standard deviation of the runs will be reported.
+`Repetitions` on the registered benchmark object. When a benchmark is run more
+than once the mean, median and standard deviation of the runs will be reported.
 
 Additionally the `--benchmark_report_aggregates_only={true|false}` flag or
 `ReportAggregatesOnly(bool)` function can be used to change how repeated tests
 are reported. By default the result of each repeated run is reported. When this
-option is 'true' only the mean and standard deviation of the runs is reported.
+option is `true`, only the mean, median and standard deviation of the runs are reported.
 Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides
 the value of the flag for that benchmark.
 
+## User-defined statistics for repeated benchmarks
+While having mean, median and standard deviation is nice, this may not be
+enough for everyone. For example you may want to know what is the largest
+observation, e.g. because you have some real-time constraints. This is easy.
+The following code will specify a custom statistic to be calculated, defined
+by a lambda function.
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK(BM_spin_empty)
+  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+    return *(std::max_element(std::begin(v), std::end(v)));
+  })
+  ->Arg(512);
+```
+
 ## Fixtures
 Fixture tests are created by
-first defining a type that derives from ::benchmark::Fixture and then
+first defining a type that derives from `::benchmark::Fixture` and then
 creating/registering the tests using the following macros:
 
 * `BENCHMARK_F(ClassName, Method)`
@@ -417,13 +573,13 @@
 class MyFixture : public benchmark::Fixture {};
 
 BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-   while (st.KeepRunning()) {
+   for (auto _ : st) {
      ...
   }
 }
 
 BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-   while (st.KeepRunning()) {
+   for (auto _ : st) {
      ...
   }
 }
@@ -432,6 +588,31 @@
 /* BarTest is now registered */
 ```
 
+### Templated fixtures
+You can also create templated fixtures by using the following macros:
+
+* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
+* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
+
+For example:
+```c++
+template<typename T>
+class MyFixture : public benchmark::Fixture {};
+
+BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
+```
 
 ## User-defined counters
 
@@ -441,7 +622,7 @@
 ```c++
 static void UserCountersExample1(benchmark::State& state) {
   double numFoos = 0, numBars = 0, numBazs = 0;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     // ... count Foo,Bar,Baz events
   }
   state.counters["Foo"] = numFoos;
@@ -564,11 +745,12 @@
 communication, occur within a benchmark the
 `State::SkipWithError(const char* msg)` function can be used to skip that run
 of benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` are skipped. Users may explicitly return to exit the
-benchmark immediately.
+`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop,
+users must explicitly exit the loop; otherwise all iterations will be performed.
+Users may explicitly return to exit the benchmark immediately.
 
 The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the `KeepRunning()` loop.
+including before and after the benchmark loop.
 
 For example:
 
@@ -579,7 +761,7 @@
       state.SkipWithError("Resource is not good!");
       // KeepRunning() loop will not be entered.
   }
-  while (state.KeepRunning()) {
+  while (state.KeepRunning()) {
       auto data = resource.read_data();
       if (!resource.good()) {
         state.SkipWithError("Failed to read data!");
@@ -588,6 +770,14 @@
      do_stuff(data);
   }
 }
+
+static void BM_test_ranged_for(benchmark::State & state) {
+  state.SkipWithError("test will not be entered");
+  for (auto _ : state) {
+    state.SkipWithError("Failed!");
+    break; // REQUIRED to prevent all further iterations.
+  }
+}
 ```
 
 ## Running a subset of the benchmarks
@@ -614,7 +804,7 @@
 is the default format.
 
 The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the 
+the format generates color output. Context is output on stderr and the
 tabular data on stdout. Example tabular output looks like:
 ```
 Benchmark                               Time(ns)    CPU(ns) Iterations
@@ -695,6 +885,9 @@
 cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
 ```
 
+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake cache variables, if autodetection fails.
+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, `LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
+
 ## Linking against the library
 When using gcc, it is necessary to link against pthread to avoid runtime exceptions.
 This is due to how gcc implements std::thread.
@@ -717,6 +910,18 @@
 Note: Using the library and its headers in C++03 is supported. C++11 is only
 required to build the library.
 
+## Disable CPU frequency scaling
+If you see this warning:
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+you might want to disable the CPU frequency scaling while running the benchmark:
+```bash
+sudo cpupower frequency-set --governor performance
+./mybench
+sudo cpupower frequency-set --governor powersave
+```
+
 # Known Issues
 
 ### Windows
diff --git a/appveyor.yml b/appveyor.yml
index e084f38..e99c6e7 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -43,7 +43,7 @@
   - md _build -Force
   - cd _build
   - echo %configuration%
-  - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" ..
+  - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON ..
   - cmake --build . --config %configuration%
 
 test_script:
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 0b176ba..17d5f3d 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -38,7 +38,7 @@
     if(ARGV1)
       string(TOUPPER "_${VARIANT}" VARIANT)
     endif()
-    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
   endif()
 endfunction()
 
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index 2c4460f..b2a8217 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -22,18 +22,35 @@
   string(TOUPPER ${FILE} VAR)
   string(TOUPPER "HAVE_${VAR}" FEATURE)
   if (DEFINED HAVE_${VAR})
-    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
+    set(HAVE_${VAR} 1 PARENT_SCOPE)
     add_definitions(-DHAVE_${VAR})
     return()
   endif()
+
   message("-- Performing Test ${FEATURE}")
-  try_run(RUN_${FEATURE} COMPILE_${FEATURE}
-          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-          CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-          LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+  if(CMAKE_CROSSCOMPILING)
+    try_compile(COMPILE_${FEATURE}
+            ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+            CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+            LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+    if(COMPILE_${FEATURE})
+      message(WARNING
+            "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
+      set(RUN_${FEATURE} 0)
+    else()
+      set(RUN_${FEATURE} 1)
+    endif()
+  else()
+    message("-- Performing Test ${FEATURE}")
+    try_run(RUN_${FEATURE} COMPILE_${FEATURE}
+            ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+            CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+            LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+  endif()
+
   if(RUN_${FEATURE} EQUAL 0)
     message("-- Performing Test ${FEATURE} -- success")
-    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
+    set(HAVE_${VAR} 1 PARENT_SCOPE)
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
@@ -43,4 +60,3 @@
     endif()
   endif()
 endfunction()
-
diff --git a/cmake/HandleGTest.cmake b/cmake/HandleGTest.cmake
new file mode 100644
index 0000000..c84f653
--- /dev/null
+++ b/cmake/HandleGTest.cmake
@@ -0,0 +1,77 @@
+# split_list(<listname>): rewrite the variable named by <listname> in place, replacing every ";" with a single space.
+macro(split_list listname)
+  string(REPLACE ";" " " ${listname} "${${listname}}")
+endmacro()
+# build_external_gtest(): download/build googletest via ExternalProject, define imported gtest and gtest_main targets, set GTEST_BOTH_LIBRARIES.
+macro(build_external_gtest)
+  include(ExternalProject)
+  set(GTEST_FLAGS "")
+  if (BENCHMARK_USE_LIBCXX)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+      list(APPEND GTEST_FLAGS -stdlib=libc++)
+    else()
+      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
+    endif()
+  endif()
+  if (BENCHMARK_BUILD_32_BITS)
+    list(APPEND GTEST_FLAGS -m32)
+  endif()
+  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
+    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
+  endif()
+  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
+  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
+    set(GTEST_BUILD_TYPE "DEBUG")  # a Coverage build of benchmark uses a Debug build of googletest
+  endif()
+  split_list(GTEST_FLAGS)  # flatten the list into one space-separated string for CMAKE_CXX_FLAGS below
+  ExternalProject_Add(googletest
+      EXCLUDE_FROM_ALL ON
+      GIT_REPOSITORY https://github.com/google/googletest.git
+      GIT_TAG master  # NOTE(review): follows a moving branch, so the fetched revision is not pinned
+      PREFIX "${CMAKE_BINARY_DIR}/googletest"
+      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
+      CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
+        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
+        -Dgtest_force_shared_crt:BOOL=ON
+      )
+
+  ExternalProject_Get_Property(googletest install_dir)
+  # Wrap the libraries installed by the external build in imported targets.
+  add_library(gtest UNKNOWN IMPORTED)
+  add_library(gtest_main UNKNOWN IMPORTED)
+
+  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
+  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
+  # NOTE(review): assumes googletest appends "d" to its library names in Debug builds -- confirm for the fetched revision.
+  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
+    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
+  endif()
+  file(MAKE_DIRECTORY ${install_dir}/include)  # create the include dir now; it is referenced below before googletest is built
+  set_target_properties(gtest PROPERTIES
+    IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}gtest${LIB_SUFFIX}
+    INTERFACE_INCLUDE_DIRECTORIES ${install_dir}/include
+  )
+  set_target_properties(gtest_main PROPERTIES
+    IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}gtest_main${LIB_SUFFIX}
+    INTERFACE_INCLUDE_DIRECTORIES ${install_dir}/include
+  )
+  add_dependencies(gtest googletest)
+  add_dependencies(gtest_main googletest)
+  set(GTEST_BOTH_LIBRARIES gtest gtest_main)
+  # Include directories are carried by the imported targets' INTERFACE_INCLUDE_DIRECTORIES set above.
+endmacro(build_external_gtest)
+# Resolve gtest: a googletest/ checkout inside this project first, then an ExternalProject download, then find_package().
+if (BENCHMARK_ENABLE_GTEST_TESTS)
+  if (IS_DIRECTORY "${PROJECT_SOURCE_DIR}/googletest")  # project-relative, so it also works when benchmark is embedded
+    add_subdirectory("${PROJECT_SOURCE_DIR}/googletest")
+    set(GTEST_BOTH_LIBRARIES gtest gtest_main)
+  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
+    build_external_gtest()
+  else()
+    find_package(GTest REQUIRED)
+  endif()
+endif()
diff --git a/cmake/Modules/FindLLVMAr.cmake b/cmake/Modules/FindLLVMAr.cmake
new file mode 100644
index 0000000..2346981
--- /dev/null
+++ b/cmake/Modules/FindLLVMAr.cmake
@@ -0,0 +1,16 @@
+# Find module for llvm-ar: locates the program and records it in LLVMAR_EXECUTABLE.
+include(FeatureSummary)
+include(FindPackageHandleStandardArgs)
+
+# Search for the llvm-ar program.
+find_program(LLVMAR_EXECUTABLE NAMES llvm-ar DOC "The llvm-ar executable")
+
+# The package counts as found only when LLVMAR_EXECUTABLE was located.
+find_package_handle_standard_args(LLVMAr DEFAULT_MSG LLVMAR_EXECUTABLE)
+
+# Package metadata registered through the FeatureSummary module.
+set_package_properties(
+  LLVMAr PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-ar.html
+  DESCRIPTION "create, modify, and extract from archives"
+)
diff --git a/cmake/Modules/FindLLVMNm.cmake b/cmake/Modules/FindLLVMNm.cmake
new file mode 100644
index 0000000..e56430a
--- /dev/null
+++ b/cmake/Modules/FindLLVMNm.cmake
@@ -0,0 +1,16 @@
+# Find module for llvm-nm: locates the program and records it in LLVMNM_EXECUTABLE.
+include(FeatureSummary)
+include(FindPackageHandleStandardArgs)
+
+# Search for the llvm-nm program.
+find_program(LLVMNM_EXECUTABLE NAMES llvm-nm DOC "The llvm-nm executable")
+
+# The package counts as found only when LLVMNM_EXECUTABLE was located.
+find_package_handle_standard_args(LLVMNm DEFAULT_MSG LLVMNM_EXECUTABLE)
+
+# Package metadata registered through the FeatureSummary module.
+set_package_properties(
+  LLVMNm PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-nm.html
+  DESCRIPTION "list LLVM bitcode and object file’s symbol table"
+)
diff --git a/cmake/Modules/FindLLVMRanLib.cmake b/cmake/Modules/FindLLVMRanLib.cmake
new file mode 100644
index 0000000..7b53e1a
--- /dev/null
+++ b/cmake/Modules/FindLLVMRanLib.cmake
@@ -0,0 +1,15 @@
+# Find module for llvm-ranlib: locates the program and records it in LLVMRANLIB_EXECUTABLE.
+include(FeatureSummary)
+include(FindPackageHandleStandardArgs)
+
+# Search for the llvm-ranlib program.
+find_program(LLVMRANLIB_EXECUTABLE NAMES llvm-ranlib DOC "The llvm-ranlib executable")
+
+# The package counts as found only when LLVMRANLIB_EXECUTABLE was located.
+find_package_handle_standard_args(LLVMRanLib DEFAULT_MSG LLVMRANLIB_EXECUTABLE)
+
+# Package metadata registered through the FeatureSummary module.
+set_package_properties(
+  LLVMRanLib PROPERTIES
+  DESCRIPTION "generate index for LLVM archive"
+)
diff --git a/cmake/benchmark.pc.in b/cmake/benchmark.pc.in
new file mode 100644
index 0000000..1e84bff
--- /dev/null
+++ b/cmake/benchmark.pc.in
@@ -0,0 +1,11 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework
+Version: @VERSION@
+
+Libs: -L${libdir} -lbenchmark
+Cflags: -I${includedir}
diff --git a/cmake/llvm-toolchain.cmake b/cmake/llvm-toolchain.cmake
new file mode 100644
index 0000000..fc119e5
--- /dev/null
+++ b/cmake/llvm-toolchain.cmake
@@ -0,0 +1,8 @@
+find_package(LLVMAr REQUIRED)  # Configuration fails if llvm-ar cannot be found.
+set(CMAKE_AR "${LLVMAR_EXECUTABLE}" CACHE FILEPATH "" FORCE)  # FORCE replaces any cached CMAKE_AR.
+
+find_package(LLVMNm REQUIRED)  # Configuration fails if llvm-nm cannot be found.
+set(CMAKE_NM "${LLVMNM_EXECUTABLE}" CACHE FILEPATH "" FORCE)  # FORCE replaces any cached CMAKE_NM.
+
+find_package(LLVMRanLib REQUIRED)  # Configuration fails if llvm-ranlib cannot be found.
+set(CMAKE_RANLIB "${LLVMRANLIB_EXECUTABLE}" CACHE FILEPATH "" FORCE)  # FORCE replaces any cached CMAKE_RANLIB.
diff --git a/docs/tools.md b/docs/tools.md
index f176f74..70500bd 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -11,49 +11,232 @@
 
 Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
 
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
 The sample output using the JSON test files under `Inputs/` gives:
 
 ``` bash
 $ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
 Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                   Time           CPU
-----------------------------------------------
-BM_SameTimes               +0.00         +0.00
-BM_2xFaster                -0.50         -0.50
-BM_2xSlower                +1.00         +1.00
-BM_10PercentFaster         -0.10         -0.10
-BM_10PercentSlower         +0.10         +0.10
+Benchmark                        Time             CPU      Time Old      Time New       CPU Old       CPU New
+-------------------------------------------------------------------------------------------------------------
+BM_SameTimes                  +0.0000         +0.0000            10            10            10            10
+BM_2xFaster                   -0.5000         -0.5000            50            25            50            25
+BM_2xSlower                   +1.0000         +1.0000            50           100            50           100
+BM_1PercentFaster             -0.0100         -0.0100           100            99           100            99
+BM_1PercentSlower             +0.0100         +0.0100           100           101           100           101
+BM_10PercentFaster            -0.1000         -0.1000           100            90           100            90
+BM_10PercentSlower            +0.1000         +0.1000           100           110           100           110
+BM_100xSlower                +99.0000        +99.0000           100         10000           100         10000
+BM_100xFaster                 -0.9900         -0.9900         10000           100         10000           100
+BM_10PercentCPUToTime         +0.1000         -0.1000           100           110           100            90
+BM_ThirdFaster                -0.3333         -0.3334           100            67           100            67
+BM_BadTimeUnit                -0.9000         +0.2000             0             0             0             1
 ```
 
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
 When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
 
 ```
 ./compare_bench.py  test/basic_test test/basic_test  --benchmark_filter=BM_empty.*
-RUNNING: test/basic_test --benchmark_filter=BM_empty.*
-Run on (4 X 4228.32 MHz CPU s)
-2016-08-02 19:21:33
+RUNNING: test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmpN7LF3a
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 23:28:36
+---------------------------------------------------------------------
 Benchmark                              Time           CPU Iterations
---------------------------------------------------------------------
-BM_empty                               9 ns          9 ns   79545455
-BM_empty/threads:4                     4 ns          9 ns   75268816
-BM_empty_stop_start                    8 ns          8 ns   83333333
-BM_empty_stop_start/threads:4          3 ns          8 ns   83333332
-RUNNING: test/basic_test --benchmark_filter=BM_empty.*
-Run on (4 X 4228.32 MHz CPU s)
-2016-08-02 19:21:35
+---------------------------------------------------------------------
+BM_empty                               4 ns          4 ns  170178757
+BM_empty/threads:8                     1 ns          7 ns  103868920
+BM_empty_stop_start                    0 ns          0 ns 1000000000
+BM_empty_stop_start/threads:8          0 ns          0 ns 1403031720
+RUNNING: /test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmplvrIp8
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 23:28:38
+---------------------------------------------------------------------
 Benchmark                              Time           CPU Iterations
---------------------------------------------------------------------
-BM_empty                               9 ns          9 ns   76086957
-BM_empty/threads:4                     4 ns          9 ns   76086956
-BM_empty_stop_start                    8 ns          8 ns   87500000
-BM_empty_stop_start/threads:4          3 ns          8 ns   88607596
-Comparing test/basic_test to test/basic_test
-Benchmark                              Time           CPU
----------------------------------------------------------
-BM_empty                              +0.00         +0.00
-BM_empty/threads:4                    +0.00         +0.00
-BM_empty_stop_start                   +0.00         +0.00
-BM_empty_stop_start/threads:4         +0.00         +0.00
+---------------------------------------------------------------------
+BM_empty                               4 ns          4 ns  169534855
+BM_empty/threads:8                     1 ns          7 ns  104188776
+BM_empty_stop_start                    0 ns          0 ns 1000000000
+BM_empty_stop_start/threads:8          0 ns          0 ns 1404159424
+Comparing ../build/test/basic_test to ../build/test/basic_test
+Benchmark                                Time             CPU      Time Old      Time New       CPU Old       CPU New
+---------------------------------------------------------------------------------------------------------------------
+BM_empty                              -0.0048         -0.0049             4             4             4             4
+BM_empty/threads:8                    -0.0123         -0.0054             1             1             7             7
+BM_empty_stop_start                   -0.0000         -0.0000             0             0             0             0
+BM_empty_stop_start/threads:8         -0.0029         +0.0001             0             0             0             0
+
 ```
 
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
 Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks.
+
+## compare.py
+
+`compare.py` can be used to compare the results of benchmarks.
+There are three modes of operation:
+
+1. Just compare two benchmarks, as `compare_bench.py` did.
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarks <benchmark_baseline> <benchmark_contender> [benchmark options]...
+```
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarks ./a.out ./a.out
+RUNNING: ./a.out --benchmark_out=/tmp/tmprBT5nW
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:44
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19101577   211.669MB/s
+BM_memcpy/64           76 ns         76 ns    9412571   800.199MB/s
+BM_memcpy/512          84 ns         84 ns    8249070   5.64771GB/s
+BM_memcpy/1024        116 ns        116 ns    6181763   8.19505GB/s
+BM_memcpy/8192        643 ns        643 ns    1062855   11.8636GB/s
+BM_copy/8             222 ns        222 ns    3137987   34.3772MB/s
+BM_copy/64           1608 ns       1608 ns     432758   37.9501MB/s
+BM_copy/512         12589 ns      12589 ns      54806   38.7867MB/s
+BM_copy/1024        25169 ns      25169 ns      27713   38.8003MB/s
+BM_copy/8192       201165 ns     201112 ns       3486   38.8466MB/s
+RUNNING: ./a.out --benchmark_out=/tmp/tmpt1wwG_
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:53
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19397903   211.255MB/s
+BM_memcpy/64           73 ns         73 ns    9691174   839.635MB/s
+BM_memcpy/512          85 ns         85 ns    8312329   5.60101GB/s
+BM_memcpy/1024        118 ns        118 ns    6438774   8.11608GB/s
+BM_memcpy/8192        656 ns        656 ns    1068644   11.6277GB/s
+BM_copy/8             223 ns        223 ns    3146977   34.2338MB/s
+BM_copy/64           1611 ns       1611 ns     435340   37.8751MB/s
+BM_copy/512         12622 ns      12622 ns      54818   38.6844MB/s
+BM_copy/1024        25257 ns      25239 ns      27779   38.6927MB/s
+BM_copy/8192       205013 ns     205010 ns       3479    38.108MB/s
+Comparing ./a.out to ./a.out
+Benchmark                 Time             CPU      Time Old      Time New       CPU Old       CPU New
+------------------------------------------------------------------------------------------------------
+BM_memcpy/8            +0.0020         +0.0020            36            36            36            36
+BM_memcpy/64           -0.0468         -0.0470            76            73            76            73
+BM_memcpy/512          +0.0081         +0.0083            84            85            84            85
+BM_memcpy/1024         +0.0098         +0.0097           116           118           116           118
+BM_memcpy/8192         +0.0200         +0.0203           643           656           643           656
+BM_copy/8              +0.0046         +0.0042           222           223           222           223
+BM_copy/64             +0.0020         +0.0020          1608          1611          1608          1611
+BM_copy/512            +0.0027         +0.0026         12589         12622         12589         12622
+BM_copy/1024           +0.0035         +0.0028         25169         25257         25169         25239
+BM_copy/8192           +0.0191         +0.0194        201165        205013        201112        205010
+```
+
+For every benchmark from the first run, it looks for the benchmark with exactly the same name in the second run and then compares the results. If the names differ, the benchmark is omitted from the diff.
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+2. Compare two different filters of one benchmark:
+The program is invoked like:
+
+``` bash
+$ compare.py filters <benchmark> <filter_baseline> <filter_contender> [benchmark options]...
+```
+Where `<benchmark>` either specifies a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py filters ./a.out BM_memcpy BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmpBWKk0k
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:28
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   17891491   211.215MB/s
+BM_memcpy/64           74 ns         74 ns    9400999   825.646MB/s
+BM_memcpy/512          87 ns         87 ns    8027453   5.46126GB/s
+BM_memcpy/1024        111 ns        111 ns    6116853    8.5648GB/s
+BM_memcpy/8192        657 ns        656 ns    1064679   11.6247GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpAvWcOM
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:33
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           227 ns        227 ns    3038700   33.6264MB/s
+BM_copy/64         1640 ns       1640 ns     426893   37.2154MB/s
+BM_copy/512       12804 ns      12801 ns      55417   38.1444MB/s
+BM_copy/1024      25409 ns      25407 ns      27516   38.4365MB/s
+BM_copy/8192     202986 ns     202990 ns       3454   38.4871MB/s
+Comparing BM_memcpy to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.2829         +5.2812            36           227            36           227
+[BM_memcpy vs. BM_copy]/64          +21.1719        +21.1856            74          1640            74          1640
+[BM_memcpy vs. BM_copy]/512        +145.6487       +145.6097            87         12804            87         12801
+[BM_memcpy vs. BM_copy]/1024       +227.1860       +227.1776           111         25409           111         25407
+[BM_memcpy vs. BM_copy]/8192       +308.1664       +308.2898           657        202986           656        202990
+```
+
+As you can see, it applies the filters to the benchmarks, both when running the benchmark and before doing the diff. To make the diff work, the matches are replaced with a common string. Thus, you can compare two different benchmark families within one benchmark binary.
+Note that the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+3. Compare filter one from benchmark one to filter two from benchmark two:
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarksfiltered <benchmark_baseline> <filter_baseline> <benchmark_contender> <filter_contender> [benchmark options]...
+```
+
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarksfiltered ./a.out BM_memcpy ./a.out BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmp_FvbYg
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:27
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            37 ns         37 ns   18953482   204.118MB/s
+BM_memcpy/64           74 ns         74 ns    9206578   828.245MB/s
+BM_memcpy/512          91 ns         91 ns    8086195   5.25476GB/s
+BM_memcpy/1024        120 ns        120 ns    5804513   7.95662GB/s
+BM_memcpy/8192        664 ns        664 ns    1028363   11.4948GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpDfL5iE
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:32
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           230 ns        230 ns    2985909   33.1161MB/s
+BM_copy/64         1654 ns       1653 ns     419408   36.9137MB/s
+BM_copy/512       13122 ns      13120 ns      53403   37.2156MB/s
+BM_copy/1024      26679 ns      26666 ns      26575   36.6218MB/s
+BM_copy/8192     215068 ns     215053 ns       3221   36.3283MB/s
+Comparing BM_memcpy (from ./a.out) to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.1649         +5.1637            37           230            37           230
+[BM_memcpy vs. BM_copy]/64          +21.4352        +21.4374            74          1654            74          1653
+[BM_memcpy vs. BM_copy]/512        +143.6022       +143.5865            91         13122            91         13120
+[BM_memcpy vs. BM_copy]/1024       +221.5903       +221.4790           120         26679           120         26666
+[BM_memcpy vs. BM_copy]/8192       +322.9059       +323.0096           664        215068           664        215053
+```
+This is a mix of the previous two modes: two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index bd3b0ff..340cbc1 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -18,7 +18,7 @@
 // Define a function that executes the code to be measured a
 // specified number of times:
 static void BM_StringCreation(benchmark::State& state) {
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string empty_string;
 }
 
@@ -28,7 +28,7 @@
 // Define another benchmark
 static void BM_StringCopy(benchmark::State& state) {
   std::string x = "hello";
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string copy(x);
 }
 BENCHMARK(BM_StringCopy);
@@ -54,7 +54,7 @@
 static void BM_memcpy(benchmark::State& state) {
   char* src = new char[state.range(0)]; char* dst = new char[state.range(0)];
   memset(src, 'x', state.range(0));
-  while (state.KeepRunning())
+  for (auto _ : state)
     memcpy(dst, src, state.range(0));
   state.SetBytesProcessed(int64_t(state.iterations()) *
                           int64_t(state.range(0)));
@@ -72,29 +72,30 @@
 // example, the following code defines a family of microbenchmarks for
 // measuring the speed of set insertion.
 static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  set<int> data;
+  for (auto _ : state) {
     state.PauseTiming();
-    set<int> data = ConstructRandomSet(state.range(0));
+    data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
     for (int j = 0; j < state.range(1); ++j)
       data.insert(RandomNumber());
   }
 }
 BENCHMARK(BM_SetInsert)
-   ->Args({1<<10, 1})
-   ->Args({1<<10, 8})
-   ->Args({1<<10, 64})
+   ->Args({1<<10, 128})
+   ->Args({2<<10, 128})
+   ->Args({4<<10, 128})
+   ->Args({8<<10, 128})
    ->Args({1<<10, 512})
-   ->Args({8<<10, 1})
-   ->Args({8<<10, 8})
-   ->Args({8<<10, 64})
+   ->Args({2<<10, 512})
+   ->Args({4<<10, 512})
    ->Args({8<<10, 512});
 
 // The preceding code is quite repetitive, and can be replaced with
 // the following short-hand.  The following macro will pick a few
 // appropriate arguments in the product of the two specified ranges
 // and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 
 // For more complex patterns of inputs, passing a custom function
 // to Apply allows programmatic specification of an
@@ -114,7 +115,7 @@
 template <class Q> int BM_Sequential(benchmark::State& state) {
   Q q;
   typename Q::value_type v;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = state.range(0); i--; )
       q.push(v);
     for (int e = state.range(0); e--; )
@@ -135,15 +136,15 @@
 BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
 
 In a multithreaded test, it is guaranteed that none of the threads will start
-until all have called KeepRunning, and all will have finished before KeepRunning
-returns false. As such, any global setup or teardown you want to do can be
-wrapped in a check against the thread index:
+until all have reached the loop start, and all will have finished before any
+thread exits the loop body. As such, any global setup or teardown you want to
+do can be wrapped in a check against the thread index:
 
 static void BM_MultiThreaded(benchmark::State& state) {
   if (state.thread_index == 0) {
     // Setup code here.
   }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     // Run the test as normal.
   }
   if (state.thread_index == 0) {
@@ -164,7 +165,8 @@
 #define BENCHMARK_BENCHMARK_H_
 
 
-#if __cplusplus >= 201103L
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
 #define BENCHMARK_HAS_CXX11
 #endif
 
@@ -237,7 +239,6 @@
 #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #endif
 
-
 namespace benchmark {
 class BenchmarkReporter;
 
@@ -289,10 +290,12 @@
 }  // namespace internal
 
 
-#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTN)
+#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__pnacl__) || \
+    defined(__EMSCRIPTEN__)  // Macro predefined by the Emscripten compiler.
 # define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
 #endif
 
+
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
@@ -378,6 +381,18 @@
 // computational complexity for the benchmark.
 typedef double(BigOFunc)(int);
 
+// StatisticsFunc is passed to a benchmark in order to compute some descriptive
+// statistics over all the measurements of some type
+typedef double(StatisticsFunc)(const std::vector<double>&);
+
+struct Statistics {
+  std::string name_;         // Name given to this statistic.
+  StatisticsFunc* compute_;  // Function that computes it from the measurements.
+
+  Statistics(const std::string& name, StatisticsFunc* compute)  // Copies name once.
+    : name_(name), compute_(compute) {}
+};
+
 namespace internal {
 class ThreadTimer;
 class ThreadManager;
@@ -398,6 +413,19 @@
 // benchmark to use.
 class State {
  public:
+  struct StateIterator;
+  friend struct StateIterator;
+
+  // Returns iterators used to run each iteration of a benchmark using a
+  // C++11 ranged-based for loop. These functions should not be called directly.
+  //
+  // REQUIRES: The benchmark has not started running yet. Neither begin nor end
+  // have been called previously.
+  //
+  // NOTE: KeepRunning may not be used after calling either of these functions.
+  BENCHMARK_ALWAYS_INLINE StateIterator begin();
+  BENCHMARK_ALWAYS_INLINE StateIterator end();
+
   // Returns true if the benchmark should continue through another iteration.
   // NOTE: A benchmark may not return from the test until KeepRunning() has
   // returned false.
@@ -405,7 +433,7 @@
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
       StartKeepRunning();
     }
-    bool const res = total_iterations_++ < max_iterations;
+    bool const res = (--total_iterations_ != 0);
     if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
       FinishKeepRunning();
     }
@@ -415,7 +443,7 @@
   // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
   //           by the current thread.
   // Stop the benchmark timer.  If not called, the timer will be
-  // automatically stopped after KeepRunning() returns false for the first time.
+  // automatically stopped after the last iteration of the benchmark loop.
   //
   // For threaded benchmarks the PauseTiming() function only pauses the timing
   // for the current thread.
@@ -431,7 +459,8 @@
   // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
   //           by the current thread.
   // Start the benchmark timer.  The timer is NOT running on entrance to the
-  // benchmark function. It begins running after the first call to KeepRunning()
+  // benchmark function. It begins running after control flow enters the
+  // benchmark loop.
   //
   // NOTE: PauseTiming()/ResumeTiming() are relatively
   // heavyweight, and so their use should generally be avoided
@@ -440,9 +469,13 @@
 
   // REQUIRES: 'SkipWithError(...)' has not been called previously by the
   //            current thread.
-  // Skip any future iterations of the 'KeepRunning()' loop in the current
-  // thread and report an error with the specified 'msg'. After this call
-  // the user may explicitly 'return' from the benchmark.
+  // Report the benchmark as resulting in an error with the specified 'msg'.
+  // After this call the user may explicitly 'return' from the benchmark.
+  //
+  // If the ranged-for style of benchmark loop is used, the user must explicitly
+  // break from the loop, otherwise all future iterations will be run.
+  // If the 'KeepRunning()' loop is used the current thread will automatically
+  // exit the loop at the end of the current iteration.
   //
   // For threaded benchmarks only the current thread stops executing and future
   // calls to `KeepRunning()` will block until all threads have completed
@@ -455,7 +488,7 @@
   // responsibility to exit the scope as needed.
   void SkipWithError(const char* msg);
 
-  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
+  // REQUIRES: called exactly once per iteration of the benchmarking loop.
   // Set the manually measured time for this benchmark iteration, which
   // is used instead of automatically measured time if UseManualTime() was
   // specified.
@@ -470,7 +503,7 @@
   // value > 0, the report is printed in MB/sec instead of nanoseconds
   // per iteration.
   //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  // REQUIRES: a benchmark has exited its benchmarking loop.
   BENCHMARK_ALWAYS_INLINE
   void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; }
 
@@ -493,7 +526,7 @@
   // executing benchmark. It is typically called at the end of a processing
   // benchmark where a processing items/second output is desired.
   //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  // REQUIRES: a benchmark has exited its benchmarking loop.
   BENCHMARK_ALWAYS_INLINE
   void SetItemsProcessed(size_t items) { items_processed_ = items; }
 
@@ -511,7 +544,7 @@
   // Produces output that looks like:
   //  BM_Compress   50         50   14115038  compress:27.3%
   //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  // REQUIRES: a benchmark has exited its benchmarking loop.
   void SetLabel(const char* label);
 
   void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
@@ -532,7 +565,7 @@
   int range_y() const { return range(1); }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t iterations() const { return total_iterations_; }
+  size_t iterations() const { return (max_iterations - total_iterations_) + 1; }
 
  private:
   bool started_;
@@ -570,6 +603,53 @@
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
 };
 
+struct State::StateIterator {  // Iterator type returned by State::begin()/end().
+  struct BENCHMARK_UNUSED Value {};  // Empty placeholder yielded by operator*.
+  typedef std::forward_iterator_tag iterator_category;
+  typedef Value value_type;
+  typedef Value reference;
+  typedef Value pointer;
+
+ private:
+  friend class State;
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator() : cached_(0), parent_() {}  // Zero count, null parent.
+
+  BENCHMARK_ALWAYS_INLINE
+  explicit StateIterator(State* st)  // Count starts at 0 if st has an error.
+      : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+
+ public:
+  BENCHMARK_ALWAYS_INLINE
+  Value operator*() const { return Value(); }
+
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator& operator++() {
+    assert(cached_ > 0);
+    --cached_;  // One fewer iteration remaining.
+    return *this;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  bool operator!=(StateIterator const&) const {  // The argument is ignored.
+    if (BENCHMARK_BUILTIN_EXPECT(cached_ != 0, true)) return true;
+    parent_->FinishKeepRunning();  // Count reached zero: notify the State.
+    return false;
+  }
+
+ private:
+  size_t cached_;  // Iterations still to run.
+  State* const parent_;  // State being iterated; null when default-constructed.
+};
+
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::begin() {
+  return StateIterator(this);  // Iterator bound to this State.
+}
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::end() {
+  StartKeepRunning();  // Invoked when the range-for evaluates end().
+  return StateIterator();  // Default-constructed: zero count, null parent.
+}
+
 namespace internal {
 
 typedef void(Function)(State&);
@@ -698,6 +778,9 @@
   // the asymptotic computational complexity will be shown on the output.
   Benchmark* Complexity(BigOFunc* complexity);
 
+  // Add this statistics to be computed over all the values of benchmark run
+  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+
   // Support for running multiple copies of the same benchmark concurrently
   // in multiple threads.  This may be useful when measuring the scaling
   // of some piece of code.
@@ -758,6 +841,7 @@
   bool use_manual_time_;
   BigO complexity_;
   BigOFunc* complexity_lambda_;
+  std::vector<Statistics> statistics_;
   std::vector<int> thread_counts_;
 
   Benchmark& operator=(Benchmark const&);
@@ -905,7 +989,7 @@
 #define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
   BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}})
 
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 
 // Register a benchmark which invokes the function specified by `func`
 // with the additional arguments specified by `...`.
@@ -925,7 +1009,7 @@
               #func "/" #test_case_name,                 \
               [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
 
-#endif  // __cplusplus >= 11
+#endif  // BENCHMARK_HAS_CXX11
 
 // This will register a benchmark for a templatized function.  For example:
 //
@@ -946,7 +1030,7 @@
           new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
                                                        n<a, b>)))
 
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE(n, ...)                       \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
@@ -967,10 +1051,63 @@
     virtual void BenchmarkCase(::benchmark::State&);          \
   };
 
+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
+   public:                                                          \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
+      this->SetName(#BaseClass"<" #a ">/" #Method);                 \
+    }                                                               \
+                                                                    \
+   protected:                                                       \
+    virtual void BenchmarkCase(::benchmark::State&);                \
+  };
+
+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
+   public:                                                             \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
+      this->SetName(#BaseClass"<" #a "," #b ">/" #Method);             \
+    }                                                                  \
+                                                                       \
+   protected:                                                          \
+    virtual void BenchmarkCase(::benchmark::State&);                   \
+  };
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
+   public:                                                                 \
+    BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() {        \
+      this->SetName(#BaseClass"<" #__VA_ARGS__ ">/" #Method);              \
+    }                                                                      \
+                                                                           \
+   protected:                                                              \
+    virtual void BenchmarkCase(::benchmark::State&);                       \
+  };
+#else  // Pre-C++11 fallback: forwards a single template argument.
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, a) BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)
+#endif  // BENCHMARK_HAS_CXX11
+
 #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
   void BaseClass##_##Method##_Benchmark::BenchmarkCase
 
+#define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
+#endif
+
 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
   BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
 
@@ -984,13 +1121,33 @@
   BENCHMARK_REGISTER_F(BaseClass, Method);       \
   void BaseClass##_##Method##_Benchmark::BenchmarkCase
 
+#define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                    \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                       \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)           \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                     \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
+#endif
+
 // Helper macro to create a main routine in a test that runs the benchmarks
 #define BENCHMARK_MAIN()                   \
   int main(int argc, char** argv) {        \
     ::benchmark::Initialize(&argc, argv);  \
     if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
     ::benchmark::RunSpecifiedBenchmarks(); \
-  }
+  }                                        \
+  int main(int, char**)
 
 
 // ------------------------------------------------------
@@ -998,6 +1155,26 @@
 
 namespace benchmark {
 
+struct CPUInfo {
+  struct CacheInfo {
+    std::string type;
+    int level;
+    int size;
+    int num_sharing;
+  };
+
+  int num_cpus;
+  double cycles_per_second;
+  std::vector<CacheInfo> caches;
+  bool scaling_enabled;
+
+  static const CPUInfo& Get();
+
+ private:
+  CPUInfo();
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo);
+};
+
 // Interface for custom benchmark result printers.
 // By default, benchmark reports are printed to stdout. However an application
 // can control the destination of the reports by calling
@@ -1006,12 +1183,11 @@
 class BenchmarkReporter {
  public:
   struct Context {
-    int num_cpus;
-    double mhz_per_cpu;
-    bool cpu_scaling_enabled;
-
+    CPUInfo const& cpu_info;
     // The number of chars in the longest benchmark name.
     size_t name_field_width;
+
+    Context();
   };
 
   struct Run {
@@ -1065,6 +1241,9 @@
     BigOFunc* complexity_lambda;
     int complexity_n;
 
+    // what statistics to compute from the measurements
+    const std::vector<Statistics>* statistics;
+
     // Inform print function whether the current run is a complexity report
     bool report_big_o;
     bool report_rms;
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
deleted file mode 100644
index a9ae671..0000000
--- a/include/benchmark/benchmark_api.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef BENCHMARK_BENCHMARK_API_H_
-#define BENCHMARK_BENCHMARK_API_H_
-
-#ifdef __DEPRECATED
-# ifndef BENCHMARK_WARNING_MSG
-#   warning the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead
-# else
-    BENCHMARK_WARNING_MSG("the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead")
-# endif
-#endif
-
-#include "benchmark.h"  // For forward declaration of BenchmarkReporter
-
-#endif  // BENCHMARK_BENCHMARK_API_H_
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
deleted file mode 100644
index 5baca1a..0000000
--- a/include/benchmark/reporter.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef BENCHMARK_REPORTER_H_
-#define BENCHMARK_REPORTER_H_
-
-#ifdef __DEPRECATED
-# ifndef BENCHMARK_WARNING_MSG
-#   warning the reporter.h header has been deprecated and will be removed, please include benchmark.h instead
-# else
-    BENCHMARK_WARNING_MSG("the reporter.h header has been deprecated and will be removed, please include benchmark.h instead")
-# endif
-#endif
-
-#include "benchmark.h"  // For forward declaration of BenchmarkReporter
-
-#endif  // BENCHMARK_REPORTER_H_
diff --git a/releasing.md b/releasing.md
new file mode 100644
index 0000000..f0cd701
--- /dev/null
+++ b/releasing.md
@@ -0,0 +1,16 @@
+# How to release
+
+* Make sure you're on master and synced to HEAD
+* Ensure the project builds and tests run (sanity check only, obviously)
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create a release through GitHub's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force origin <tag>`
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 244484b..e22620a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -38,11 +38,13 @@
 set(lib_install_dir "lib/")
 set(bin_install_dir "bin/")
 set(config_install_dir "lib/cmake/${PROJECT_NAME}")
+set(pkgconfig_install_dir "lib/pkgconfig")
 
 set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
 
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
 set(targets_export_name "${PROJECT_NAME}Targets")
 
 set(namespace "${PROJECT_NAME}::")
@@ -53,26 +55,33 @@
 )
 
 configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
 
-# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
-install(
-  TARGETS benchmark
-  EXPORT ${targets_export_name}
-  ARCHIVE DESTINATION ${lib_install_dir}
-  LIBRARY DESTINATION ${lib_install_dir}
-  RUNTIME DESTINATION ${bin_install_dir}
-  INCLUDES DESTINATION ${include_install_dir})
+if (BENCHMARK_ENABLE_INSTALL)
+  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
+  install(
+    TARGETS benchmark
+    EXPORT ${targets_export_name}
+    ARCHIVE DESTINATION ${lib_install_dir}
+    LIBRARY DESTINATION ${lib_install_dir}
+    RUNTIME DESTINATION ${bin_install_dir}
+    INCLUDES DESTINATION ${include_install_dir})
 
-install(
-  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION ${include_install_dir}
-  FILES_MATCHING PATTERN "*.*h")
+  install(
+    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+    DESTINATION ${include_install_dir}
+    FILES_MATCHING PATTERN "*.*h")
 
-install(
-    FILES "${project_config}" "${version_config}"
-    DESTINATION "${config_install_dir}")
+  install(
+      FILES "${project_config}" "${version_config}"
+      DESTINATION "${config_install_dir}")
 
-install(
-    EXPORT "${targets_export_name}"
-    NAMESPACE "${namespace}"
-    DESTINATION "${config_install_dir}")
+  install(
+      FILES "${pkg_config}"
+      DESTINATION "${pkgconfig_install_dir}")
+
+  install(
+      EXPORT "${targets_export_name}"
+      NAMESPACE "${namespace}"
+      DESTINATION "${config_install_dir}")
+endif()
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 1ba0a50..1a7d218 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -38,12 +38,12 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
+#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
-#include "stat.h"
+#include "statistics.h"
 #include "string_util.h"
-#include "sysinfo.h"
 #include "timers.h"
 
 DEFINE_bool(benchmark_list_tests, false,
@@ -99,20 +99,15 @@
 DEFINE_int32(v, 0, "The level of verbose logging to output");
 
 namespace benchmark {
-namespace internal {
-
-void UseCharPointer(char const volatile*) {}
-
-}  // end namespace internal
 
 namespace {
-
 static const size_t kMaxIterations = 1000000000;
-
 }  // end namespace
 
 namespace internal {
 
+void UseCharPointer(char const volatile*) {}
+
 class ThreadManager {
  public:
   ThreadManager(int num_threads)
@@ -180,7 +175,9 @@
     CHECK(running_);
     running_ = false;
     real_time_used_ += ChronoClockNow() - start_real_time_;
-    cpu_time_used_ += ThreadCPUUsage() - start_cpu_time_;
+    // Floating point error can result in the subtraction producing a negative
+    // time. Guard against that.
+    cpu_time_used_ += std::max<double>(ThreadCPUUsage() - start_cpu_time_, 0);
   }
 
   // Called by each thread
@@ -256,6 +253,7 @@
     report.complexity_n = results.complexity_n;
     report.complexity = b.complexity;
     report.complexity_lambda = b.complexity_lambda;
+    report.statistics = b.statistics;
     report.counters = results.counters;
     internal::Finish(&report.counters, seconds, b.threads);
   }
@@ -401,7 +399,7 @@
              internal::ThreadManager* manager)
     : started_(false),
       finished_(false),
-      total_iterations_(0),
+      total_iterations_(max_iters + 1),
       range_(ranges),
       bytes_processed_(0),
       items_processed_(0),
@@ -414,6 +412,7 @@
       timer_(timer),
       manager_(manager) {
   CHECK(max_iterations != 0) << "At least one iteration must be run";
+  CHECK(total_iterations_ != 0) << "max iterations wrapped around";
   CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
 }
 
@@ -438,7 +437,7 @@
       manager_->results.has_error_ = true;
     }
   }
-  total_iterations_ = max_iterations;
+  total_iterations_ = 1;
   if (timer_->running()) timer_->StopTimer();
 }
 
@@ -463,8 +462,8 @@
   if (!error_occurred_) {
     PauseTiming();
   }
-  // Total iterations now is one greater than max iterations. Fix this.
-  total_iterations_ = max_iterations;
+  // Total iterations has now wrapped around zero. Fix this.
+  total_iterations_ = 1;
   finished_ = true;
   manager_->StartStopBarrier();
 }
@@ -481,19 +480,19 @@
   // Determine the width of the name field using a minimum width of 10.
   bool has_repetitions = FLAGS_benchmark_repetitions > 1;
   size_t name_field_width = 10;
+  size_t stat_field_width = 0;
   for (const Benchmark::Instance& benchmark : benchmarks) {
     name_field_width =
         std::max<size_t>(name_field_width, benchmark.name.size());
     has_repetitions |= benchmark.repetitions > 1;
+
+    for(const auto& Stat : *benchmark.statistics)
+      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
   }
-  if (has_repetitions) name_field_width += std::strlen("_stddev");
+  if (has_repetitions) name_field_width += 1 + stat_field_width;
 
   // Print header here
   BenchmarkReporter::Context context;
-  context.num_cpus = NumCPUs();
-  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
-
-  context.cpu_scaling_enabled = CpuScalingEnabled();
   context.name_field_width = name_field_width;
 
   // Keep track of runing times of all instances of current benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 36d2340..d481dc5 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -25,6 +25,7 @@
   BigO complexity;
   BigOFunc* complexity_lambda;
   UserCounters counters;
+  const std::vector<Statistics>* statistics;
   bool last_benchmark_instance;
   int repetitions;
   double min_time;
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index ed70d82..d5746a3 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -37,12 +37,11 @@
 #include "check.h"
 #include "commandlineflags.h"
 #include "complexity.h"
+#include "statistics.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
-#include "stat.h"
 #include "string_util.h"
-#include "sysinfo.h"
 #include "timers.h"
 
 namespace benchmark {
@@ -159,6 +158,7 @@
         instance.use_manual_time = family->use_manual_time_;
         instance.complexity = family->complexity_;
         instance.complexity_lambda = family->complexity_lambda_;
+        instance.statistics = &family->statistics_;
         instance.threads = num_threads;
 
         // Add arguments to instance name
@@ -236,7 +236,11 @@
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
-      complexity_lambda_(nullptr) {}
+      complexity_lambda_(nullptr) {
+  ComputeStatistics("mean", StatisticsMean);
+  ComputeStatistics("median", StatisticsMedian);
+  ComputeStatistics("stddev", StatisticsStdDev);
+}
 
 Benchmark::~Benchmark() {}
 
@@ -409,6 +413,12 @@
   return this;
 }
 
+Benchmark* Benchmark::ComputeStatistics(std::string name,
+                                        StatisticsFunc* statistics) {
+  statistics_.emplace_back(name, statistics);
+  return this;
+}
+
 Benchmark* Benchmark::Threads(int t) {
   CHECK_GT(t, 0);
   thread_counts_.push_back(t);
@@ -437,8 +447,7 @@
 }
 
 Benchmark* Benchmark::ThreadPerCpu() {
-  static int num_cpus = NumCPUs();
-  thread_counts_.push_back(num_cpus);
+  thread_counts_.push_back(CPUInfo::Get().num_cpus);
   return this;
 }
 
diff --git a/src/complexity.cc b/src/complexity.cc
index 33975be..8883269 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -21,7 +21,6 @@
 #include <cmath>
 #include "check.h"
 #include "complexity.h"
-#include "stat.h"
 
 namespace benchmark {
 
@@ -150,109 +149,6 @@
   return best_fit;
 }
 
-std::vector<BenchmarkReporter::Run> ComputeStats(
-    const std::vector<BenchmarkReporter::Run>& reports) {
-  typedef BenchmarkReporter::Run Run;
-  std::vector<Run> results;
-
-  auto error_count =
-      std::count_if(reports.begin(), reports.end(),
-                    [](Run const& run) { return run.error_occurred; });
-
-  if (reports.size() - error_count < 2) {
-    // We don't report aggregated data if there was a single run.
-    return results;
-  }
-  // Accumulators.
-  Stat1_d real_accumulated_time_stat;
-  Stat1_d cpu_accumulated_time_stat;
-  Stat1_d bytes_per_second_stat;
-  Stat1_d items_per_second_stat;
-  // All repetitions should be run with the same number of iterations so we
-  // can take this information from the first benchmark.
-  int64_t const run_iterations = reports.front().iterations;
-  // create stats for user counters
-  struct CounterStat {
-    Counter c;
-    Stat1_d s;
-  };
-  std::map< std::string, CounterStat > counter_stats;
-  for(Run const& r : reports) {
-    for(auto const& cnt : r.counters) {
-      auto it = counter_stats.find(cnt.first);
-      if(it == counter_stats.end()) {
-        counter_stats.insert({cnt.first, {cnt.second, Stat1_d{}}});
-      } else {
-        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
-      }
-    }
-  }
-
-  // Populate the accumulators.
-  for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
-    CHECK_EQ(run_iterations, run.iterations);
-    if (run.error_occurred) continue;
-    real_accumulated_time_stat +=
-        Stat1_d(run.real_accumulated_time / run.iterations);
-    cpu_accumulated_time_stat +=
-        Stat1_d(run.cpu_accumulated_time / run.iterations);
-    items_per_second_stat += Stat1_d(run.items_per_second);
-    bytes_per_second_stat += Stat1_d(run.bytes_per_second);
-    // user counters
-    for(auto const& cnt : run.counters) {
-      auto it = counter_stats.find(cnt.first);
-      CHECK_NE(it, counter_stats.end());
-      it->second.s += Stat1_d(cnt.second);
-    }
-  }
-
-  // Get the data from the accumulator to BenchmarkReporter::Run's.
-  Run mean_data;
-  mean_data.benchmark_name = reports[0].benchmark_name + "_mean";
-  mean_data.iterations = run_iterations;
-  mean_data.real_accumulated_time =
-      real_accumulated_time_stat.Mean() * run_iterations;
-  mean_data.cpu_accumulated_time =
-      cpu_accumulated_time_stat.Mean() * run_iterations;
-  mean_data.bytes_per_second = bytes_per_second_stat.Mean();
-  mean_data.items_per_second = items_per_second_stat.Mean();
-  mean_data.time_unit = reports[0].time_unit;
-  // user counters
-  for(auto const& kv : counter_stats) {
-    auto c = Counter(kv.second.s.Mean(), counter_stats[kv.first].c.flags);
-    mean_data.counters[kv.first] = c;
-  }
-
-  // Only add label to mean/stddev if it is same for all runs
-  mean_data.report_label = reports[0].report_label;
-  for (std::size_t i = 1; i < reports.size(); i++) {
-    if (reports[i].report_label != reports[0].report_label) {
-      mean_data.report_label = "";
-      break;
-    }
-  }
-
-  Run stddev_data;
-  stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev";
-  stddev_data.report_label = mean_data.report_label;
-  stddev_data.iterations = 0;
-  stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev();
-  stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
-  stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
-  stddev_data.items_per_second = items_per_second_stat.StdDev();
-  stddev_data.time_unit = reports[0].time_unit;
-  // user counters
-  for(auto const& kv : counter_stats) {
-    auto c = Counter(kv.second.s.StdDev(), counter_stats[kv.first].c.flags);
-    stddev_data.counters[kv.first] = c;
-  }
-
-  results.push_back(mean_data);
-  results.push_back(stddev_data);
-  return results;
-}
-
 std::vector<BenchmarkReporter::Run> ComputeBigO(
     const std::vector<BenchmarkReporter::Run>& reports) {
   typedef BenchmarkReporter::Run Run;
diff --git a/src/complexity.h b/src/complexity.h
index c0ca60e..df29b48 100644
--- a/src/complexity.h
+++ b/src/complexity.h
@@ -25,12 +25,6 @@
 
 namespace benchmark {
 
-// Return a vector containing the mean and standard devation information for
-// the specified list of reports. If 'reports' contains less than two
-// non-errored runs an empty vector is returned
-std::vector<BenchmarkReporter::Run> ComputeStats(
-    const std::vector<BenchmarkReporter::Run>& reports);
-
 // Return a vector containing the bigO and RMS information for the specified
 // list of reports. If 'reports.size() < 2' an empty vector is returned.
 std::vector<BenchmarkReporter::Run> ComputeBigO(
@@ -57,4 +51,5 @@
 std::string GetBigOString(BigO complexity);
 
 }  // end namespace benchmark
+
 #endif  // COMPLEXITY_H_
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 4bb6f71..48920ca 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -148,12 +148,14 @@
   }
 
   for (auto& c : result.counters) {
-    auto const& s = HumanReadableNumber(c.second.value);
+    const int cNameLen = static_cast<int>(  // '*' width below must be an int
+        std::max(std::string::size_type(10), c.first.length()));
+    auto const& s = HumanReadableNumber(c.second.value, 1000);
     if (output_options_ & OO_Tabular) {
       if (c.second.flags & Counter::kIsRate) {
-        printer(Out, COLOR_DEFAULT, " %8s/s", s.c_str());
+        printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
       } else {
-        printer(Out, COLOR_DEFAULT, " %10s", s.c_str());
+        printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str());
       }
     } else {
       const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 9428874..6cdb3c2 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -6,6 +6,9 @@
 #ifndef __has_feature
 #define __has_feature(x) 0
 #endif
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
 
 #if defined(__clang__)
 #define COMPILER_CLANG
@@ -39,6 +42,8 @@
   #endif
 #elif defined(__FreeBSD__)
 #define BENCHMARK_OS_FREEBSD 1
+#elif defined(__NetBSD__)
+#define BENCHMARK_OS_NETBSD 1
 #elif defined(__linux__)
 #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
@@ -54,4 +59,18 @@
 #define BENCHMARK_HAS_NO_EXCEPTIONS
 #endif
 
+#if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
+#define BENCHMARK_MAYBE_UNUSED __attribute__((unused))
+#else
+#define BENCHMARK_MAYBE_UNUSED
+#endif
+
+#if defined(COMPILER_GCC) || __has_builtin(__builtin_unreachable)
+#define BENCHMARK_UNREACHABLE() __builtin_unreachable()
+#elif defined(COMPILER_MSVC)
+#define BENCHMARK_UNREACHABLE() __assume(false)
+#else
+#define BENCHMARK_UNREACHABLE() ((void)0)
+#endif
+
 #endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index edf6ecc..b5ae302 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -21,6 +21,8 @@
 #include <string>
 #include <tuple>
 #include <vector>
+#include <iomanip> // for setprecision
+#include <limits>
 
 #include "string_util.h"
 #include "timers.h"
@@ -48,7 +50,14 @@
 }
 
 std::string FormatKV(std::string const& key, double value) {
-  return StringPrintF("\"%s\": %.2f", key.c_str(), value);
+  std::stringstream ss;
+  ss << '"' << key << "\": ";
+  // max_digits10 significant digits let the value round-trip through text.
+  const auto max_digits10 = std::numeric_limits<decltype (value)>::max_digits10;
+  const auto max_fractional_digits10 = max_digits10 - 1;
+  // Scientific notation puts one digit before the point, hence the "- 1".
+  ss << std::scientific << std::setprecision(max_fractional_digits10) << value;
+  return ss.str();
 }
 
 int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
@@ -68,13 +77,37 @@
   std::string walltime_value = LocalDateTimeString();
   out << indent << FormatKV("date", walltime_value) << ",\n";
 
-  out << indent << FormatKV("num_cpus", static_cast<int64_t>(context.num_cpus))
+  CPUInfo const& info = context.cpu_info;
+  out << indent << FormatKV("num_cpus", static_cast<int64_t>(info.num_cpus))
       << ",\n";
-  out << indent << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu))
+  out << indent
+      << FormatKV("mhz_per_cpu",
+                  RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
-  out << indent << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
+  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
       << ",\n";
 
+  out << indent << "\"caches\": [\n";
+  indent = std::string(6, ' ');
+  std::string cache_indent(8, ' ');
+  for (size_t i = 0; i < info.caches.size(); ++i) {
+    auto& CI = info.caches[i];
+    out << indent << "{\n";
+    out << cache_indent << FormatKV("type", CI.type) << ",\n";
+    out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
+        << ",\n";
+    out << cache_indent
+        << FormatKV("size", static_cast<int64_t>(CI.size) * 1000u) << ",\n";
+    out << cache_indent
+        << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
+        << "\n";
+    out << indent << "}";
+    if (i != info.caches.size() - 1) out << ",";
+    out << "\n";
+  }
+  indent = std::string(4, ' ');
+  out << indent << "],\n";
+
 #if defined(NDEBUG)
   const char build_type[] = "release";
 #else
@@ -125,18 +158,18 @@
   if (!run.report_big_o && !run.report_rms) {
     out << indent << FormatKV("iterations", run.iterations) << ",\n";
     out << indent
-        << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
+        << FormatKV("real_time", run.GetAdjustedRealTime())
         << ",\n";
     out << indent
-        << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
+        << FormatKV("cpu_time", run.GetAdjustedCPUTime());
     out << ",\n"
         << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
     out << indent
-        << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime()))
+        << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime())
         << ",\n";
     out << indent
-        << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime()))
+        << FormatKV("real_coefficient", run.GetAdjustedRealTime())
         << ",\n";
     out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n";
     out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
@@ -147,17 +180,17 @@
   if (run.bytes_per_second > 0.0) {
     out << ",\n"
         << indent
-        << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
+        << FormatKV("bytes_per_second", run.bytes_per_second);
   }
   if (run.items_per_second > 0.0) {
     out << ",\n"
         << indent
-        << FormatKV("items_per_second", RoundDouble(run.items_per_second));
+        << FormatKV("items_per_second", run.items_per_second);
   }
   for(auto &c : run.counters) {
     out << ",\n"
         << indent
-        << FormatKV(c.first, RoundDouble(c.second));
+        << FormatKV(c.first, c.second);
   }
   if (!run.report_label.empty()) {
     out << ",\n" << indent << FormatKV("label", run.report_label);
diff --git a/src/reporter.cc b/src/reporter.cc
index aacd453..5d2fa05 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -22,7 +22,6 @@
 #include <vector>
 
 #include "check.h"
-#include "stat.h"
 
 namespace benchmark {
 
@@ -36,12 +35,24 @@
   CHECK(out) << "cannot be null";
   auto &Out = *out;
 
-  Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
-      << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
-
   Out << LocalDateTimeString() << "\n";
 
-  if (context.cpu_scaling_enabled) {
+  const CPUInfo &info = context.cpu_info;
+  Out << "Run on (" << info.num_cpus << " X "
+      << (info.cycles_per_second / 1000000.0) << " MHz CPU "
+      << ((info.num_cpus > 1) ? "s" : "") << ")\n";
+  if (info.caches.size() != 0) {
+    Out << "CPU Caches:\n";
+    for (auto &CInfo : info.caches) {
+      Out << "  L" << CInfo.level << " " << CInfo.type << " "
+          << (CInfo.size / 1000) << "K";
+      if (CInfo.num_sharing != 0)
+        Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
+      Out << "\n";
+    }
+  }
+
+  if (info.scaling_enabled) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
            "real time measurements may be noisy and will incur extra "
            "overhead.\n";
@@ -53,6 +64,8 @@
 #endif
 }
 
+BenchmarkReporter::Context::Context() : cpu_info(CPUInfo::Get()) {}
+
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
   if (iterations != 0) new_time /= static_cast<double>(iterations);
diff --git a/src/stat.h b/src/stat.h
deleted file mode 100644
index d356875..0000000
--- a/src/stat.h
+++ /dev/null
@@ -1,310 +0,0 @@
-#ifndef BENCHMARK_STAT_H_
-#define BENCHMARK_STAT_H_
-
-#include <cmath>
-#include <limits>
-#include <ostream>
-#include <type_traits>
-
-namespace benchmark {
-
-template <typename VType, typename NumType>
-class Stat1;
-
-template <typename VType, typename NumType>
-class Stat1MinMax;
-
-typedef Stat1<float, int64_t> Stat1_f;
-typedef Stat1<double, int64_t> Stat1_d;
-typedef Stat1MinMax<float, int64_t> Stat1MinMax_f;
-typedef Stat1MinMax<double, int64_t> Stat1MinMax_d;
-
-template <typename VType>
-class Vector2;
-template <typename VType>
-class Vector3;
-template <typename VType>
-class Vector4;
-
-template <typename VType, typename NumType>
-class Stat1 {
- public:
-  typedef Stat1<VType, NumType> Self;
-
-  Stat1() { Clear(); }
-  // Create a sample of value dat and weight 1
-  explicit Stat1(const VType &dat) {
-    sum_ = dat;
-    sum_squares_ = Sqr(dat);
-    numsamples_ = 1;
-  }
-  // Create statistics for all the samples between begin (included)
-  // and end(excluded)
-  explicit Stat1(const VType *begin, const VType *end) {
-    Clear();
-    for (const VType *item = begin; item < end; ++item) {
-      (*this) += Stat1(*item);
-    }
-  }
-  // Create a sample of value dat and weight w
-  Stat1(const VType &dat, const NumType &w) {
-    sum_ = w * dat;
-    sum_squares_ = w * Sqr(dat);
-    numsamples_ = w;
-  }
-  // Copy operator
-  Stat1(const Self &stat) {
-    sum_ = stat.sum_;
-    sum_squares_ = stat.sum_squares_;
-    numsamples_ = stat.numsamples_;
-  }
-
-  void Clear() {
-    numsamples_ = NumType();
-    sum_squares_ = sum_ = VType();
-  }
-
-  Self &operator=(const Self &stat) {
-    sum_ = stat.sum_;
-    sum_squares_ = stat.sum_squares_;
-    numsamples_ = stat.numsamples_;
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self &operator+=(const Self &stat) {
-    sum_ += stat.sum_;
-    sum_squares_ += stat.sum_squares_;
-    numsamples_ += stat.numsamples_;
-    return (*this);
-  }
-  // The operation opposite to +=
-  Self &operator-=(const Self &stat) {
-    sum_ -= stat.sum_;
-    sum_squares_ -= stat.sum_squares_;
-    numsamples_ -= stat.numsamples_;
-    return (*this);
-  }
-  // Multiply the weight of the set of samples by a factor k
-  Self &operator*=(const VType &k) {
-    sum_ *= k;
-    sum_squares_ *= k;
-    numsamples_ *= k;
-    return (*this);
-  }
-
-  // Merge statistics from two sample sets.
-  Self operator+(const Self &stat) const { return Self(*this) += stat; }
-
-  // The operation opposite to +
-  Self operator-(const Self &stat) const { return Self(*this) -= stat; }
-
-  // Multiply the weight of the set of samples by a factor k
-  Self operator*(const VType &k) const { return Self(*this) *= k; }
-
-  // Return the total weight of this sample set
-  NumType numSamples() const { return numsamples_; }
-
-  // Return the sum of this sample set
-  VType Sum() const { return sum_; }
-
-  // Return the mean of this sample set
-  VType Mean() const {
-    if (numsamples_ == 0) return VType();
-    return sum_ * (1.0 / numsamples_);
-  }
-
-  // Return the mean of this sample set and compute the standard deviation at
-  // the same time.
-  VType Mean(VType *stddev) const {
-    if (numsamples_ == 0) return VType();
-    VType mean = sum_ * (1.0 / numsamples_);
-    if (stddev) {
-      // Sample standard deviation is undefined for n = 1
-      if (numsamples_ == 1) {
-        *stddev = VType();
-      } else {
-        VType avg_squares = sum_squares_ * (1.0 / numsamples_);
-        *stddev = Sqrt(numsamples_ / (numsamples_ - 1.0) * (avg_squares - Sqr(mean)));
-      }
-    }
-    return mean;
-  }
-
-  // Return the standard deviation of the sample set
-  VType StdDev() const {
-    VType stddev = VType();
-    Mean(&stddev);
-    return stddev;
-  }
-
- private:
-  static_assert(std::is_integral<NumType>::value &&
-                    !std::is_same<NumType, bool>::value,
-                "NumType must be an integral type that is not bool.");
-  // Let i be the index of the samples provided (using +=)
-  // and weight[i],value[i] be the data of sample #i
-  // then the variables have the following meaning:
-  NumType numsamples_;  // sum of weight[i];
-  VType sum_;           // sum of weight[i]*value[i];
-  VType sum_squares_;   // sum of weight[i]*value[i]^2;
-
-  // Template function used to square a number.
-  // For a vector we square all components
-  template <typename SType>
-  static inline SType Sqr(const SType &dat) {
-    return dat * dat;
-  }
-
-  template <typename SType>
-  static inline Vector2<SType> Sqr(const Vector2<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  template <typename SType>
-  static inline Vector3<SType> Sqr(const Vector3<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  template <typename SType>
-  static inline Vector4<SType> Sqr(const Vector4<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  // Template function used to take the square root of a number.
-  // For a vector we square all components
-  template <typename SType>
-  static inline SType Sqrt(const SType &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    if (dat < 0) return 0;
-    return sqrt(dat);
-  }
-
-  template <typename SType>
-  static inline Vector2<SType> Sqrt(const Vector2<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector2<SType>()).Sqrt();
-  }
-
-  template <typename SType>
-  static inline Vector3<SType> Sqrt(const Vector3<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector3<SType>()).Sqrt();
-  }
-
-  template <typename SType>
-  static inline Vector4<SType> Sqrt(const Vector4<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector4<SType>()).Sqrt();
-  }
-};
-
-// Useful printing function
-template <typename VType, typename NumType>
-std::ostream &operator<<(std::ostream &out, const Stat1<VType, NumType> &s) {
-  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
-      << " nsamples = " << s.NumSamples() << "}";
-  return out;
-}
-
-// Stat1MinMax: same as Stat1, but it also
-// keeps the Min and Max values; the "-"
-// operator is disabled because it cannot be implemented
-// efficiently
-template <typename VType, typename NumType>
-class Stat1MinMax : public Stat1<VType, NumType> {
- public:
-  typedef Stat1MinMax<VType, NumType> Self;
-
-  Stat1MinMax() { Clear(); }
-  // Create a sample of value dat and weight 1
-  explicit Stat1MinMax(const VType &dat) : Stat1<VType, NumType>(dat) {
-    max_ = dat;
-    min_ = dat;
-  }
-  // Create statistics for all the samples between begin (included)
-  // and end(excluded)
-  explicit Stat1MinMax(const VType *begin, const VType *end) {
-    Clear();
-    for (const VType *item = begin; item < end; ++item) {
-      (*this) += Stat1MinMax(*item);
-    }
-  }
-  // Create a sample of value dat and weight w
-  Stat1MinMax(const VType &dat, const NumType &w)
-      : Stat1<VType, NumType>(dat, w) {
-    max_ = dat;
-    min_ = dat;
-  }
-  // Copy operator
-  Stat1MinMax(const Self &stat) : Stat1<VType, NumType>(stat) {
-    max_ = stat.max_;
-    min_ = stat.min_;
-  }
-
-  void Clear() {
-    Stat1<VType, NumType>::Clear();
-    if (std::numeric_limits<VType>::has_infinity) {
-      min_ = std::numeric_limits<VType>::infinity();
-      max_ = -std::numeric_limits<VType>::infinity();
-    } else {
-      min_ = std::numeric_limits<VType>::max();
-      max_ = std::numeric_limits<VType>::min();
-    }
-  }
-
-  Self &operator=(const Self &stat) {
-    this->Stat1<VType, NumType>::operator=(stat);
-    max_ = stat.max_;
-    min_ = stat.min_;
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self &operator+=(const Self &stat) {
-    this->Stat1<VType, NumType>::operator+=(stat);
-    if (stat.max_ > max_) max_ = stat.max_;
-    if (stat.min_ < min_) min_ = stat.min_;
-    return (*this);
-  }
-  // Multiply the weight of the set of samples by a factor k
-  Self &operator*=(const VType &stat) {
-    this->Stat1<VType, NumType>::operator*=(stat);
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self operator+(const Self &stat) const { return Self(*this) += stat; }
-  // Multiply the weight of the set of samples by a factor k
-  Self operator*(const VType &k) const { return Self(*this) *= k; }
-
-  // Return the maximal value in this sample set
-  VType Max() const { return max_; }
-  // Return the minimal value in this sample set
-  VType Min() const { return min_; }
-
- private:
-  // The - operation makes no sense with Min/Max
-  // unless we keep the full list of values (but we don't)
-  // make it private, and let it undefined so nobody can call it
-  Self &operator-=(const Self &stat);  // senseless. let it undefined.
-
-  // The operation opposite to -
-  Self operator-(const Self &stat) const;  // senseless. let it undefined.
-
-  // Let i be the index of the samples provided (using +=)
-  // and weight[i],value[i] be the data of sample #i
-  // then the variables have the following meaning:
-  VType max_;  // max of value[i]
-  VType min_;  // min of value[i]
-};
-
-// Useful printing function
-template <typename VType, typename NumType>
-std::ostream &operator<<(std::ostream &out,
-                         const Stat1MinMax<VType, NumType> &s) {
-  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
-      << " nsamples = " << s.NumSamples() << " min = " << s.Min()
-      << " max = " << s.Max() << "}";
-  return out;
-}
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_STAT_H_
diff --git a/src/statistics.cc b/src/statistics.cc
new file mode 100644
index 0000000..5932ad4
--- /dev/null
+++ b/src/statistics.cc
@@ -0,0 +1,175 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <numeric>
+#include "check.h"
+#include "statistics.h"
+
+namespace benchmark {
+
+auto StatisticsSum = [](const std::vector<double>& samples) {  // 0.0 if empty
+  return std::accumulate(samples.cbegin(), samples.cend(), 0.0);
+};
+
+double StatisticsMean(const std::vector<double>& v) {
+  // An empty sample set has no mean; report 0.0 rather than dividing by zero.
+  return v.empty() ? 0.0 : StatisticsSum(v) * (1.0 / v.size());
+}
+
+double StatisticsMedian(const std::vector<double>& v) {
+  // With fewer than three samples the median coincides with the mean.
+  if (v.size() < 3) return StatisticsMean(v);
+  // Only the smallest floor(n/2)+1 values are needed, in sorted order.
+  std::vector<double> lower_half(v.size() / 2 + 1);
+  std::partial_sort_copy(v.begin(), v.end(), lower_half.begin(),
+                         lower_half.end());
+  const double upper_mid = lower_half.back();
+  // Odd sample count: the last sorted value is the median itself.
+  if (v.size() % 2 != 0) return upper_mid;
+  // Even sample count: average the two middle values.
+  return (lower_half[lower_half.size() - 2] + upper_mid) / 2.0;
+}
+
+// Sum of x * x over every sample x in the set (0.0 for an empty set)
+auto SumSquares = [](const std::vector<double>& samples) {
+  return std::inner_product(samples.cbegin(), samples.cend(), samples.cbegin(), 0.0);
+};
+
+auto Sqr = [](const double x) { return x * x; };
+auto Sqrt = [](const double x) {
+  // Slightly negative inputs can arise from rounding error; clamp them to
+  // zero instead of letting std::sqrt produce NaN.
+  return x < 0.0 ? 0.0 : std::sqrt(x);
+};
+
+double StatisticsStdDev(const std::vector<double>& v) {
+  const std::size_t n = v.size();
+  // No samples: the mean is 0.0 and that is what gets reported. One sample:
+  // the sample standard deviation is undefined, so 0.0 is reported as well.
+  if (n < 2) return 0.0;
+  const double mean = StatisticsMean(v);
+  const double mean_of_squares = SumSquares(v) * (1.0 / n);
+  // Bessel-corrected variance: n / (n - 1) * (mean of squares - mean^2).
+  const double variance = n / (n - 1.0) * (mean_of_squares - Sqr(mean));
+  return Sqrt(variance);
+}
+
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+  // Count the runs that reported an error; they are excluded from aggregation.
+  auto error_count =
+      std::count_if(reports.begin(), reports.end(),
+                    [](Run const& run) { return run.error_occurred; });
+
+  if (reports.size() - error_count < 2) {
+    // No aggregated data is reported for fewer than two non-errored runs.
+    return results;
+  }
+
+  // Accumulators: one sample per non-errored run.
+  std::vector<double> real_accumulated_time_stat;
+  std::vector<double> cpu_accumulated_time_stat;
+  std::vector<double> bytes_per_second_stat;
+  std::vector<double> items_per_second_stat;
+
+  real_accumulated_time_stat.reserve(reports.size());
+  cpu_accumulated_time_stat.reserve(reports.size());
+  bytes_per_second_stat.reserve(reports.size());
+  items_per_second_stat.reserve(reports.size());
+
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  int64_t const run_iterations = reports.front().iterations;
+  // Per-counter state: the first Counter seen (for its flags) plus its samples.
+  struct CounterStat {
+    Counter c;
+    std::vector<double> s;
+  };
+  std::map< std::string, CounterStat > counter_stats;
+  for(Run const& r : reports) {
+    for(auto const& cnt : r.counters) {
+      auto it = counter_stats.find(cnt.first);
+      if(it == counter_stats.end()) {
+        it = counter_stats.insert(
+            {cnt.first, {cnt.second, std::vector<double>{}}}).first;
+        it->second.s.reserve(reports.size());
+      } else {
+        CHECK_EQ(it->second.c.flags, cnt.second.flags);
+      }
+    }
+  }
+
+  // Populate the accumulators.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    CHECK_EQ(run_iterations, run.iterations);
+    if (run.error_occurred) continue;
+    real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
+    cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
+    items_per_second_stat.emplace_back(run.items_per_second);
+    bytes_per_second_stat.emplace_back(run.bytes_per_second);
+    // user counters
+    for(auto const& cnt : run.counters) {
+      auto it = counter_stats.find(cnt.first);
+      CHECK_NE(it, counter_stats.end());
+      it->second.s.emplace_back(cnt.second);
+    }
+  }
+
+  // Only add label if it is same for all runs
+  std::string report_label = reports[0].report_label;
+  for (std::size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != report_label) {
+      report_label = "";
+      break;
+    }
+  }
+
+  for(const auto& Stat : *reports[0].statistics) {
+    // Build one aggregate Run per configured statistic from the accumulators.
+    Run data;
+    data.benchmark_name = reports[0].benchmark_name + "_" + Stat.name_;
+    data.report_label = report_label;
+    data.iterations = run_iterations;
+
+    data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
+    data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
+    data.bytes_per_second = Stat.compute_(bytes_per_second_stat);
+    data.items_per_second = Stat.compute_(items_per_second_stat);
+
+    data.time_unit = reports[0].time_unit;
+
+    // user counters: same statistic, keeping the flags of the first Counter
+    for(auto const& kv : counter_stats) {
+      const auto uc_stat = Stat.compute_(kv.second.s);
+      auto c = Counter(uc_stat, kv.second.c.flags);
+      data.counters[kv.first] = c;
+    }
+
+    results.push_back(data);
+  }
+
+  return results;
+}
+
+}  // end namespace benchmark
diff --git a/src/statistics.h b/src/statistics.h
new file mode 100644
index 0000000..7eccc85
--- /dev/null
+++ b/src/statistics.h
@@ -0,0 +1,37 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATISTICS_H_
+#define STATISTICS_H_
+
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+namespace benchmark {
+
+// Return a vector containing the mean, median and standard deviation
+// information (and any user-specified info) for the specified list of reports.
+// An empty vector is returned if fewer than two runs completed without error.
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+double StatisticsMean(const std::vector<double>& v);
+double StatisticsMedian(const std::vector<double>& v);
+double StatisticsStdDev(const std::vector<double>& v);
+
+}  // end namespace benchmark
+
+#endif  // STATISTICS_H_
diff --git a/src/string_util.cc b/src/string_util.cc
index cd4e7cf..29edb2a 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -27,8 +27,6 @@
 
 static const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
-}  // end anonymous namespace
-
 void ToExponentAndMantissa(double val, double thresh, int precision,
                            double one_k, std::string* mantissa,
                            int64_t* exponent) {
@@ -100,14 +98,16 @@
 }
 
 std::string ToBinaryStringFullySpecified(double value, double threshold,
-                                         int precision) {
+                                         int precision, double one_k = 1024.0) {
   std::string mantissa;
   int64_t exponent;
-  ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
+  ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa,
                         &exponent);
   return mantissa + ExponentToPrefix(exponent, false);
 }
 
+}  // end namespace
+
 void AppendHumanReadable(int n, std::string* str) {
   std::stringstream ss;
   // Round down to the nearest SI prefix.
@@ -115,11 +115,11 @@
   *str += ss.str();
 }
 
-std::string HumanReadableNumber(double n) {
+std::string HumanReadableNumber(double n, double one_k) {
   // 1.1 means that figures up to 1.1k should be shown with the next unit down;
   // this softens edge effects.
   // 1 means that we should show one decimal place of precision.
-  return ToBinaryStringFullySpecified(n, 1.1, 1);
+  return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
 }
 
 std::string StringPrintFImp(const char* msg, va_list args) {
diff --git a/src/string_util.h b/src/string_util.h
index 0b190b9..c3d53bf 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -10,7 +10,7 @@
 
 void AppendHumanReadable(int n, std::string* str);
 
-std::string HumanReadableNumber(double n);
+std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
 std::string StringPrintF(const char* format, ...);
 
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 7feb79e..2520ad5 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "sysinfo.h"
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
@@ -25,21 +24,29 @@
 #include <sys/time.h>
 #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
-#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
+    defined BENCHMARK_OS_NETBSD
+#define BENCHMARK_HAS_SYSCTL
 #include <sys/sysctl.h>
 #endif
 #endif
 
+#include <algorithm>
+#include <array>
+#include <bitset>
 #include <cerrno>
+#include <climits>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <iostream>
+#include <iterator>
 #include <limits>
-#include <mutex>
+#include <memory>
+#include <sstream>
 
-#include "arraysize.h"
 #include "check.h"
 #include "cycleclock.h"
 #include "internal_macros.h"
@@ -49,214 +56,431 @@
 
 namespace benchmark {
 namespace {
-std::once_flag cpuinfo_init;
-double cpuinfo_cycles_per_second = 1.0;
-int cpuinfo_num_cpus = 1;  // Conservative guess
 
-#if !defined BENCHMARK_OS_MACOSX
-const int64_t estimate_time_ms = 1000;
+void PrintImp(std::ostream& out) { out << std::endl; }
 
-// Helper function estimates cycles/sec by observing cycles elapsed during
-// sleep(). Using small sleep time decreases accuracy significantly.
-int64_t EstimateCyclesPerSecond() {
-  const int64_t start_ticks = cycleclock::Now();
-  SleepForMilliseconds(estimate_time_ms);
-  return cycleclock::Now() - start_ticks;
+template <class First, class... Rest>
+void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
+  out << std::forward<First>(f);
+  PrintImp(out, std::forward<Rest>(rest)...);
+}
+
+template <class... Args>
+BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
+  PrintImp(std::cerr, std::forward<Args>(args)...);
+  std::exit(EXIT_FAILURE);
+}
+
+#ifdef BENCHMARK_HAS_SYSCTL
+
+/// ValueUnion - A type used to correctly alias the byte-for-byte output of
+/// `sysctl` with the result type it's to be interpreted as.
+struct ValueUnion {
+  union DataT {
+    uint32_t uint32_value;
+    uint64_t uint64_value;
+    // For correct aliasing of union members from bytes.
+    char bytes[8];
+  };
+  using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
+
+  // The size of the data union member + its trailing array size.
+  size_t Size;
+  DataPtr Buff;
+
+ public:
+  ValueUnion() : Size(0), Buff(nullptr, &std::free) {}
+
+  explicit ValueUnion(size_t BuffSize)
+      : Size(sizeof(DataT) + BuffSize),
+        Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
+
+  ValueUnion(ValueUnion&& other) = default;
+
+  explicit operator bool() const { return bool(Buff); }
+
+  char* data() const { return Buff->bytes; }
+
+  std::string GetAsString() const { return std::string(data()); }
+
+  int64_t GetAsInteger() const {
+    if (Size == sizeof(Buff->uint32_value))
+      return static_cast<int32_t>(Buff->uint32_value);
+    else if (Size == sizeof(Buff->uint64_value))
+      return static_cast<int64_t>(Buff->uint64_value);
+    BENCHMARK_UNREACHABLE();
+  }
+
+  uint64_t GetAsUnsigned() const {
+    if (Size == sizeof(Buff->uint32_value))
+      return Buff->uint32_value;
+    else if (Size == sizeof(Buff->uint64_value))
+      return Buff->uint64_value;
+    BENCHMARK_UNREACHABLE();
+  }
+
+  template <class T, int N>
+  std::array<T, N> GetAsArray() {
+    const int ArrSize = sizeof(T) * N;
+    CHECK_LE(ArrSize, Size);
+    std::array<T, N> Arr;
+    std::memcpy(Arr.data(), data(), ArrSize);
+    return Arr;
+  }
+};
+
+ValueUnion GetSysctlImp(std::string const& Name) {
+  const char* Key = Name.c_str();
+  size_t Needed = 0;
+  // Probe the size with a null buffer, then fetch; failure gives an empty union.
+  if (sysctlbyname(Key, nullptr, &Needed, nullptr, 0) == -1) return ValueUnion();
+  ValueUnion buff(Needed);
+  if (sysctlbyname(Key, buff.data(), &buff.Size, nullptr, 0) != 0)
+    return ValueUnion();
+  return buff;
+}
+
+BENCHMARK_MAYBE_UNUSED
+bool GetSysctl(std::string const& Name, std::string* Out) {
+  Out->clear();
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  Out->assign(Buff.data());
+  return true;
+}
+
+template <class Tp,
+          class = typename std::enable_if<std::is_integral<Tp>::value>::type>
+bool GetSysctl(std::string const& Name, Tp* Out) {
+  *Out = 0;
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  *Out = static_cast<Tp>(Buff.GetAsUnsigned());
+  return true;
+}
+
+template <class Tp, size_t N>
+bool GetSysctl(std::string const& Name, std::array<Tp, N>* Out) {
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  *Out = Buff.GetAsArray<Tp, N>();
+  return true;
 }
 #endif
 
-#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-// Helper function for reading an int from a file. Returns true if successful
-// and the memory location pointed to by value is set to the value read.
-bool ReadIntFromFile(const char* file, long* value) {
-  bool ret = false;
-  int fd = open(file, O_RDONLY);
-  if (fd != -1) {
-    char line[1024];
-    char* err;
-    memset(line, '\0', sizeof(line));
-    ssize_t read_err = read(fd, line, sizeof(line) - 1);
-    ((void)read_err); // prevent unused warning
-    CHECK(read_err >= 0);
-    const long temp_value = strtol(line, &err, 10);
-    if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
-      *value = temp_value;
-      ret = true;
+template <class ArgT>
+bool ReadFromFile(std::string const& fname, ArgT* arg) {
+  // Reads the first whitespace-delimited token of `fname` into `*arg`.
+  *arg = ArgT();
+  std::ifstream f(fname.c_str());
+  // fail(), not good(): a token ending exactly at EOF sets eofbit yet is valid.
+  return f.is_open() && !(f >> *arg).fail();
+}
+
+bool CpuScalingEnabled(int num_cpus) {
+  // Without a valid CPU count there is nothing to inspect.
+  if (num_cpus <= 0) return false;
+#ifndef BENCHMARK_OS_WINDOWS
+  // Linux's CPUfreq subsystem publishes each core's governor as a file; read
+  // failures (e.g. not running on Linux) are deliberately ignored.
+  const std::string prefix = "/sys/devices/system/cpu/cpu";
+  const std::string suffix = "/cpufreq/scaling_governor";
+  for (int idx = 0; idx < num_cpus; ++idx) {
+    std::string governor;
+    if (!ReadFromFile(StrCat(prefix, idx, suffix), &governor)) continue;
+    if (governor != "performance") return true;
+  }
+#endif
+  return false;
+}
+
+int CountSetBitsInCPUMap(std::string Val) {
+  // Parses one hexadecimal chunk of the map and returns its number of set bits.
+  auto PopCount = [](const std::string& Chunk) {
+    std::bitset<sizeof(std::uintptr_t) * CHAR_BIT> Bits(
+        std::stoul("0x" + Chunk, nullptr, 16));
+    return static_cast<int>(Bits.count());
+  };
+  // Chunks are comma-separated; walk them with a cursor into Val.
+  int SetBits = 0;
+  size_t Begin = 0;
+  for (size_t Comma = Val.find(','); Comma != std::string::npos;
+       Comma = Val.find(',', Begin)) {
+    SetBits += PopCount(Val.substr(Begin, Comma - Begin));
+    Begin = Comma + 1;
+  }
+  if (Begin < Val.size()) SetBits += PopCount(Val.substr(Begin));
+  return SetBits;
+}
+
+BENCHMARK_MAYBE_UNUSED
+std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
+  int Idx = 0;
+  while (true) {
+    CPUInfo::CacheInfo info;
+    std::string FPath = StrCat(dir, "index", Idx++, "/");
+    std::ifstream f(StrCat(FPath, "size").c_str());
+    if (!f.is_open()) break;
+    std::string suffix;
+    f >> info.size;
+    if (f.fail())
+      PrintErrorAndDie("Failed while reading file '", FPath, "size'");
+    if (f.good()) {
+      f >> suffix;
+      if (f.bad())
+        PrintErrorAndDie(
+            "Invalid cache size format: failed to read size suffix");
+      else if (f && suffix != "K")
+        PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
+      else if (suffix == "K")
+        info.size *= 1000;
     }
-    close(fd);
+    if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
+      PrintErrorAndDie("Failed to read from file ", FPath, "type");
+    if (!ReadFromFile(StrCat(FPath, "level"), &info.level))
+      PrintErrorAndDie("Failed to read from file ", FPath, "level");
+    std::string map_str;
+    if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str))
+      PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map");
+    info.num_sharing = CountSetBitsInCPUMap(map_str);
+    res.push_back(info);
   }
-  return ret;
+
+  return res;
+}
+
+#ifdef BENCHMARK_OS_MACOSX
+std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::array<uint64_t, 4> CacheCounts{{0, 0, 0, 0}};
+  GetSysctl("hw.cacheconfig", &CacheCounts);
+
+  struct {
+    std::string name;
+    std::string type;
+    int level;
+    size_t num_sharing;
+  } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
+               {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
+               {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
+               {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}};
+  for (auto& C : Cases) {
+    int val;
+    if (!GetSysctl(C.name, &val)) continue;
+    CPUInfo::CacheInfo info;
+    info.type = C.type;
+    info.level = C.level;
+    info.size = val;
+    info.num_sharing = static_cast<int>(C.num_sharing);
+    res.push_back(std::move(info));
+  }
+  return res;
+}
+#elif defined(BENCHMARK_OS_WINDOWS)
+std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
+  std::vector<CPUInfo::CacheInfo> res;
+  DWORD buffer_size = 0;
+  using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION;
+  using CInfo = CACHE_DESCRIPTOR;
+
+  using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
+  GetLogicalProcessorInformation(nullptr, &buffer_size);
+  UPtr buff((PInfo*)malloc(buffer_size), &std::free);
+  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
+    PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
+                     GetLastError());
+
+  PInfo* it = buff.get();
+  PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));
+
+  for (; it != end; ++it) {
+    if (it->Relationship != RelationCache) continue;
+    using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
+    BitSet B(it->ProcessorMask);
+    // To prevent duplicates, only consider caches where CPU 0 is specified
+    if (!B.test(0)) continue;
+    CInfo* Cache = &it->Cache;
+    CPUInfo::CacheInfo C;
+    C.num_sharing = B.count();
+    C.level = Cache->Level;
+    C.size = Cache->Size;
+    switch (Cache->Type) {
+      case CacheUnified:
+        C.type = "Unified";
+        break;
+      case CacheInstruction:
+        C.type = "Instruction";
+        break;
+      case CacheData:
+        C.type = "Data";
+        break;
+      case CacheTrace:
+        C.type = "Trace";
+        break;
+      default:
+        C.type = "Unknown";
+        break;
+    }
+    res.push_back(C);
+  }
+  return res;
 }
 #endif
 
-#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-static std::string convertToLowerCase(std::string s) {
-  for (auto& ch : s)
-    ch = std::tolower(ch);
-  return s;
-}
-static bool startsWithKey(std::string Value, std::string Key,
-                          bool IgnoreCase = true) {
-  if (IgnoreCase) {
-    Key = convertToLowerCase(std::move(Key));
-    Value = convertToLowerCase(std::move(Value));
-  }
-  return Value.compare(0, Key.size(), Key) == 0;
-}
+std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
+#ifdef BENCHMARK_OS_MACOSX
+  return GetCacheSizesMacOSX();
+#elif defined(BENCHMARK_OS_WINDOWS)
+  return GetCacheSizesWindows();
+#else
+  return GetCacheSizesFromKVFS();
 #endif
+}
 
-void InitializeSystemInfo() {
+int GetNumCPUs() {  // CPU count, or -1 when /proc/cpuinfo cannot be read.
+#ifdef BENCHMARK_HAS_SYSCTL
+  int NumCPU = -1;
+  if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU;
+  fprintf(stderr, "Err: %s\n", strerror(errno));
+  std::exit(EXIT_FAILURE);  // a failed sysctl query is treated as fatal
+#elif defined(BENCHMARK_OS_WINDOWS)
+  SYSTEM_INFO sysinfo;
+  // Use memset as opposed to = {} to avoid GCC missing initializer false
+  // positives.
+  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
+  GetSystemInfo(&sysinfo);
+  // Number of logical processors in the current group.
+  return sysinfo.dwNumberOfProcessors;
+#else
+  // Count the "processor" lines of /proc/cpuinfo and track the highest ID.
+  int NumCPUs = 0;
+  int MaxID = -1;
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return -1;
+  }
+  const std::string Key = "processor";
+  std::string ln;
+  while (std::getline(f, ln)) {
+    // Case-sensitive prefix test; compare() also rejects too-short lines.
+    if (ln.compare(0, Key.size(), Key) != 0) continue;
+    NumCPUs++;
+    const size_t SplitIdx = ln.find(':');
+    if (SplitIdx == std::string::npos) continue;
+    // Use strtol, not std::stoi, which throws if no number can be parsed.
+    const char* IdStr = ln.c_str() + SplitIdx + 1;
+    char* End = nullptr;
+    const long CurID = std::strtol(IdStr, &End, 10);
+    if (End != IdStr) MaxID = std::max(static_cast<int>(CurID), MaxID);
+  }
+  // getline() stops at end-of-file or on error; tell the two apart.
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return -1;
+  }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return -1;
+  }
+  f.close();
+
+  if ((MaxID + 1) != NumCPUs) {
+    fprintf(stderr,
+            "CPU ID assignments in /proc/cpuinfo seem messed up."
+            " This is usually caused by a bad BIOS.\n");
+  }
+  return NumCPUs;
+#endif
+  BENCHMARK_UNREACHABLE();
+}
+
+double GetCPUCyclesPerSecond() {
 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-  char line[1024];
-  char* err;
   long freq;
 
-  bool saw_mhz = false;
-
   // If the kernel is exporting the tsc frequency use that. There are issues
   // where cpuinfo_max_freq cannot be relied on because the BIOS may be
   // exporintg an invalid p-state (on x86) or p-states may be used to put the
   // processor in a new mode (turbo mode). Essentially, those frequencies
   // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
   // well.
-  if (!saw_mhz &&
-      ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
+  if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
+      // If CPU scaling is in effect, we want to use the *maximum* frequency,
+      // not whatever CPU speed some random processor happens to be using now.
+      || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
     // The value is in kHz (as the file name suggests).  For example, on a
     // 2GHz warpstation, the file contains the value "2000000".
-    cpuinfo_cycles_per_second = freq * 1000.0;
-    saw_mhz = true;
+    return freq * 1000.0;
   }
 
-  // If CPU scaling is in effect, we want to use the *maximum* frequency,
-  // not whatever CPU speed some random processor happens to be using now.
-  if (!saw_mhz &&
-      ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
-                      &freq)) {
-    // The value is in kHz.  For example, on a 2GHz warpstation, the file
-    // contains the value "2000000".
-    cpuinfo_cycles_per_second = freq * 1000.0;
-    saw_mhz = true;
+  const double error_value = -1;
+  double bogo_clock = error_value;
+
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return error_value;
   }
 
-  // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
-  const char* pname = "/proc/cpuinfo";
-  int fd = open(pname, O_RDONLY);
-  if (fd == -1) {
-    perror(pname);
-    if (!saw_mhz) {
-      cpuinfo_cycles_per_second =
-          static_cast<double>(EstimateCyclesPerSecond());
-    }
-    return;
-  }
+  auto startsWithKey = [](std::string const& Value, std::string const& Key) {
+    if (Key.size() > Value.size()) return false;  // Key cannot be a prefix.
+    auto Cmp = [](unsigned char X, unsigned char Y) {  // not plain char:
+      return std::tolower(X) == std::tolower(Y);       // negative is UB
+    };
+    return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp);
+  };  // case-insensitive "does Value start with Key?"
 
-  double bogo_clock = 1.0;
-  bool saw_bogo = false;
-  long max_cpu_id = 0;
-  int num_cpus = 0;
-  line[0] = line[1] = '\0';
-  size_t chars_read = 0;
-  do {  // we'll exit when the last read didn't read anything
-    // Move the next line to the beginning of the buffer
-    const size_t oldlinelen = strlen(line);
-    if (sizeof(line) == oldlinelen + 1)  // oldlinelen took up entire line
-      line[0] = '\0';
-    else  // still other lines left to save
-      memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1));
-    // Terminate the new line, reading more if we can't find the newline
-    char* newline = strchr(line, '\n');
-    if (newline == nullptr) {
-      const size_t linelen = strlen(line);
-      const size_t bytes_to_read = sizeof(line) - 1 - linelen;
-      CHECK(bytes_to_read > 0);  // because the memmove recovered >=1 bytes
-      chars_read = read(fd, line + linelen, bytes_to_read);
-      line[linelen + chars_read] = '\0';
-      newline = strchr(line, '\n');
-    }
-    if (newline != nullptr) *newline = '\0';
-
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    size_t SplitIdx = ln.find(':');
+    std::string value;
+    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
     // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
     // accept postive values. Some environments (virtual machines) report zero,
     // which would cause infinite looping in WallTime_Init.
-    if (!saw_mhz && startsWithKey(line, "cpu MHz")) {
-      const char* freqstr = strchr(line, ':');
-      if (freqstr) {
-        cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0;
-        if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
-          saw_mhz = true;
+    if (startsWithKey(ln, "cpu MHz")) {
+      if (!value.empty()) {
+        double cycles_per_second = std::stod(value) * 1000000.0;
+        if (cycles_per_second > 0) return cycles_per_second;
       }
-    } else if (startsWithKey(line, "bogomips")) {
-      const char* freqstr = strchr(line, ':');
-      if (freqstr) {
-        bogo_clock = strtod(freqstr + 1, &err) * 1000000.0;
-        if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
-          saw_bogo = true;
+    } else if (startsWithKey(ln, "bogomips")) {
+      if (!value.empty()) {
+        bogo_clock = std::stod(value) * 1000000.0;
+        if (bogo_clock <= 0.0) bogo_clock = error_value;  // zero is unusable
       }
-    } else if (startsWithKey(line, "processor", /*IgnoreCase*/false)) {
-      // The above comparison is case-sensitive because ARM kernels often
-      // include a "Processor" line that tells you about the CPU, distinct
-      // from the usual "processor" lines that give you CPU ids. No current
-      // Linux architecture is using "Processor" for CPU ids.
-      num_cpus++;  // count up every time we see an "processor :" entry
-      const char* id_str = strchr(line, ':');
-      if (id_str) {
-        const long cpu_id = strtol(id_str + 1, &err, 10);
-        if (id_str[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id)
-          max_cpu_id = cpu_id;
-      }
-    }
-  } while (chars_read > 0);
-  close(fd);
-
-  if (!saw_mhz) {
-    if (saw_bogo) {
-      // If we didn't find anything better, we'll use bogomips, but
-      // we're not happy about it.
-      cpuinfo_cycles_per_second = bogo_clock;
-    } else {
-      // If we don't even have bogomips, we'll use the slow estimation.
-      cpuinfo_cycles_per_second =
-          static_cast<double>(EstimateCyclesPerSecond());
     }
   }
-  if (num_cpus == 0) {
-    fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n");
-  } else {
-    if ((max_cpu_id + 1) != num_cpus) {
-      fprintf(stderr,
-              "CPU ID assignments in /proc/cpuinfo seem messed up."
-              " This is usually caused by a bad BIOS.\n");
-    }
-    cpuinfo_num_cpus = num_cpus;
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return error_value;
   }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return error_value;
+  }
+  f.close();
+  // If we found the bogomips clock, but nothing better, we'll use it (but
+  // we're not happy about it); otherwise, fallback to the rough estimation
+  // below.
+  if (bogo_clock >= 0.0) return bogo_clock;
 
-#elif defined BENCHMARK_OS_FREEBSD
-// For this sysctl to work, the machine must be configured without
-// SMP, APIC, or APM support.  hz should be 64-bit in freebsd 7.0
-// and later.  Before that, it's a 32-bit quantity (and gives the
-// wrong answer on machines faster than 2^32 Hz).  See
-//  http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
-// But also compare FreeBSD 7.0:
-//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
-//  231         error = sysctl_handle_quad(oidp, &freq, 0, req);
-// To FreeBSD 6.3 (it's the same in 6-STABLE):
-//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
-//  139         error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
-#if __FreeBSD__ >= 7
-  uint64_t hz = 0;
+#elif defined BENCHMARK_HAS_SYSCTL
+  constexpr auto* FreqStr =
+#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
+      "machdep.tsc_freq";
 #else
-  unsigned int hz = 0;
+      "hw.cpufrequency";
 #endif
-  size_t sz = sizeof(hz);
-  const char* sysctl_path = "machdep.tsc_freq";
-  if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) {
-    fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
-            sysctl_path, strerror(errno));
-    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-  } else {
-    cpuinfo_cycles_per_second = hz;
-  }
-// TODO: also figure out cpuinfo_num_cpus
+  unsigned long long hz = 0;
+  if (GetSysctl(FreqStr, &hz)) return hz;
+
+  fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+          FreqStr, strerror(errno));
 
 #elif defined BENCHMARK_OS_WINDOWS
   // In NT, read MHz from the registry. If we fail to do so or we're in win9x
@@ -267,89 +491,27 @@
           SHGetValueA(HKEY_LOCAL_MACHINE,
                       "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                       "~MHz", nullptr, &data, &data_size)))
-    cpuinfo_cycles_per_second =
-        static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
-  else
-    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-
-  SYSTEM_INFO sysinfo;
-  // Use memset as opposed to = {} to avoid GCC missing initializer false
-  // positives.
-  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
-  GetSystemInfo(&sysinfo);
-  cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors;  // number of logical
-                                                    // processors in the current
-                                                    // group
-
-#elif defined BENCHMARK_OS_MACOSX
-  int32_t num_cpus = 0;
-  size_t size = sizeof(num_cpus);
-  if (::sysctlbyname("hw.ncpu", &num_cpus, &size, nullptr, 0) == 0 &&
-      (size == sizeof(num_cpus))) {
-    cpuinfo_num_cpus = num_cpus;
-  } else {
-    fprintf(stderr, "%s\n", strerror(errno));
-    std::exit(EXIT_FAILURE);
-  }
-  int64_t cpu_freq = 0;
-  size = sizeof(cpu_freq);
-  if (::sysctlbyname("hw.cpufrequency", &cpu_freq, &size, nullptr, 0) == 0 &&
-      (size == sizeof(cpu_freq))) {
-    cpuinfo_cycles_per_second = cpu_freq;
-  } else {
-    #if defined BENCHMARK_OS_IOS
-    fprintf(stderr, "CPU frequency cannot be detected. \n");
-    cpuinfo_cycles_per_second = 0;
-    #else
-    fprintf(stderr, "%s\n", strerror(errno));
-    std::exit(EXIT_FAILURE);
-    #endif
-  }
-#else
-  // Generic cycles per second counter
-  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+    return static_cast<double>((int64_t)data *
+                               (int64_t)(1000 * 1000));  // was mhz
 #endif
+  // If we've fallen through, attempt to roughly estimate the CPU clock rate.
+  const int estimate_time_ms = 1000;
+  const auto start_ticks = cycleclock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  return static_cast<double>(cycleclock::Now() - start_ticks);
 }
 
 }  // end namespace
 
-double CyclesPerSecond(void) {
-  std::call_once(cpuinfo_init, InitializeSystemInfo);
-  return cpuinfo_cycles_per_second;
+const CPUInfo& CPUInfo::Get() {
+  static const CPUInfo* info = new CPUInfo();  // built on first call, kept
+  return *info;                                // alive; no delete in this scope
 }
 
-int NumCPUs(void) {
-  std::call_once(cpuinfo_init, InitializeSystemInfo);
-  return cpuinfo_num_cpus;
-}
-
-// The ""'s catch people who don't pass in a literal for "str"
-#define strliterallen(str) (sizeof("" str "") - 1)
-
-// Must use a string literal for prefix.
-#define memprefix(str, len, prefix)                       \
-  ((((len) >= strliterallen(prefix)) &&                   \
-    std::memcmp(str, prefix, strliterallen(prefix)) == 0) \
-       ? str + strliterallen(prefix)                      \
-       : nullptr)
-
-bool CpuScalingEnabled() {
-#ifndef BENCHMARK_OS_WINDOWS
-  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
-  // local file system. If reading the exported files fails, then we may not be
-  // running on Linux, so we silently ignore all the read errors.
-  for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
-    std::string governor_file =
-        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    FILE* file = fopen(governor_file.c_str(), "r");
-    if (!file) break;
-    char buff[16];
-    size_t bytes_read = fread(buff, 1, sizeof(buff), file);
-    fclose(file);
-    if (memprefix(buff, bytes_read, "performance") == nullptr) return true;
-  }
-#endif
-  return false;
-}
+CPUInfo::CPUInfo()  // fills every field from the probing helpers
+    : num_cpus(GetNumCPUs()),  // assumed declared before scaling_enabled
+      cycles_per_second(GetCPUCyclesPerSecond()),
+      caches(GetCacheSizes()),
+      scaling_enabled(CpuScalingEnabled(num_cpus)) {}  // reads member num_cpus
 
 }  // end namespace benchmark
diff --git a/src/sysinfo.h b/src/sysinfo.h
deleted file mode 100644
index c5d9916..0000000
--- a/src/sysinfo.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef BENCHMARK_SYSINFO_H_
-#define BENCHMARK_SYSINFO_H_
-
-namespace benchmark {
-int NumCPUs();
-double CyclesPerSecond();
-bool CpuScalingEnabled();
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_SYSINFO_H_
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b55612b..efce3ba 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -42,7 +42,6 @@
           ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_output_test)
 
-
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
 add_test(benchmark benchmark_test --benchmark_min_time=0.01)
@@ -98,6 +97,9 @@
 compile_output_test(reporter_output_test)
 add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
 
+compile_output_test(templated_fixture_test)
+add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01)
+
 compile_output_test(user_counters_test)
 add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
 
@@ -106,13 +108,20 @@
 
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
-  set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
-  string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-  string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-
   compile_benchmark_test(cxx03_test)
   set_target_properties(cxx03_test
-      PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
+      PROPERTIES
+      COMPILE_FLAGS "-std=c++03")  # only the dialect flag is set on the target
+  # libstdc++ provides different definitions within <map> between dialects. When
+  # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
+  # causing the test to fail to compile. To prevent this we explicitly disable
+  # the warning.
+  check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)  # probe for the flag
+  if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR)
+    set_target_properties(cxx03_test
+        PROPERTIES
+        LINK_FLAGS "-Wno-odr")  # passed at link time, only for LTO builds
+  endif()
   add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
 endif()
 
@@ -125,6 +134,29 @@
 compile_output_test(complexity_test)
 add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
 
+###############################################################################
+# GoogleTest Unit Tests
+###############################################################################
+
+if (BENCHMARK_ENABLE_GTEST_TESTS)
+  macro(compile_gtest name)  # builds ${name}.cc, links benchmark + GoogleTest
+    add_executable(${name} "${name}.cc")
+    if (TARGET googletest)  # build after the googletest target when it exists
+      add_dependencies(${name} googletest)
+    endif()
+    target_link_libraries(${name} benchmark
+        "${GTEST_BOTH_LIBRARIES}" ${CMAKE_THREAD_LIBS_INIT})
+  endmacro()
+
+  macro(add_gtest name)  # compile_gtest(name) plus a CTest entry of that name
+    compile_gtest(${name})
+    add_test(${name} ${name})
+  endmacro()
+
+  add_gtest(statistics_test)
+endif()
+
+
 # Add the coverage command(s)
 if(CMAKE_BUILD_TYPE)
   string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
diff --git a/test/basic_test.cc b/test/basic_test.cc
index bc1f96d..3348781 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -4,7 +4,7 @@
 #define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
 
 void BM_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
 }
@@ -12,7 +12,7 @@
 BENCHMARK(BM_empty)->ThreadPerCpu();
 
 void BM_spin_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int x = 0; x < state.range(0); ++x) {
       benchmark::DoNotOptimize(x);
     }
@@ -25,7 +25,7 @@
   for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
@@ -35,7 +35,7 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
 
 void BM_spin_pause_during(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     state.PauseTiming();
     for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
@@ -50,7 +50,7 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
 
 void BM_pause_during(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     state.PauseTiming();
     state.ResumeTiming();
   }
@@ -61,7 +61,7 @@
 BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
 
 void BM_spin_pause_after(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
@@ -77,7 +77,7 @@
   for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
@@ -90,10 +90,29 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
 
 void BM_empty_stop_start(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_empty_stop_start);
 BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
 
-BENCHMARK_MAIN()
+
+void BM_KeepRunning(benchmark::State& state) {  // exercises the while-loop API
+  size_t iter_count = 0;
+  while (state.KeepRunning()) {
+    ++iter_count;
+  }
+  assert(iter_count == state.max_iterations);  // body ran once per iteration
+}
+BENCHMARK(BM_KeepRunning);
+
+void BM_RangedFor(benchmark::State& state) {  // exercises the range-for API
+  size_t iter_count = 0;
+  for (auto _ : state) {
+    ++iter_count;
+  }
+  assert(iter_count == state.max_iterations);  // body ran once per iteration
+}
+BENCHMARK(BM_RangedFor);
+
+BENCHMARK_MAIN();
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 7a16466..78802c8 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -42,7 +42,7 @@
 
 std::set<int> ConstructRandomSet(int size) {
   std::set<int> s;
-  for (int i = 0; i < size; ++i) s.insert(i);
+  for (int i = 0; i < size; ++i) s.insert(s.end(), i);  // hint: keys ascend
   return s;
 }
 
@@ -53,7 +53,7 @@
 
 static void BM_Factorial(benchmark::State& state) {
   int fac_42 = 0;
-  while (state.KeepRunning()) fac_42 = Factorial(8);
+  for (auto _ : state) fac_42 = Factorial(8);
   // Prevent compiler optimizations
   std::stringstream ss;
   ss << fac_42;
@@ -64,7 +64,7 @@
 
 static void BM_CalculatePiRange(benchmark::State& state) {
   double pi = 0.0;
-  while (state.KeepRunning()) pi = CalculatePi(state.range(0));
+  for (auto _ : state) pi = CalculatePi(state.range(0));
   std::stringstream ss;
   ss << pi;
   state.SetLabel(ss.str());
@@ -73,7 +73,7 @@
 
 static void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(CalculatePi(depth));
   }
 }
@@ -82,22 +82,26 @@
 BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
 
 static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  std::set<int> data;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
+    data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
     for (int j = 0; j < state.range(1); ++j) data.insert(rand());
   }
   state.SetItemsProcessed(state.iterations() * state.range(1));
   state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
 }
-BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {1, 10}});
+
+// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
+// non-timed part of each iteration will make the benchmark take forever.
+BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
 
 template <typename Container,
           typename ValueType = typename Container::value_type>
 static void BM_Sequential(benchmark::State& state) {
   ValueType v = 42;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     Container c;
     for (int i = state.range(0); --i;) c.push_back(v);
   }
@@ -109,14 +113,14 @@
     ->Range(1 << 0, 1 << 10);
 BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
 // Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
 #endif
 
 static void BM_StringCompare(benchmark::State& state) {
   std::string s1(state.range(0), '-');
   std::string s2(state.range(0), '-');
-  while (state.KeepRunning()) benchmark::DoNotOptimize(s1.compare(s2));
+  for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
 }
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
@@ -126,7 +130,7 @@
     test_vector = new std::vector<int>();
   }
   int i = 0;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::lock_guard<std::mutex> l(test_vector_mu);
     if (i % 2 == 0)
       test_vector->push_back(i);
@@ -142,7 +146,7 @@
 
 static void BM_LongTest(benchmark::State& state) {
   double tracker = 0.0;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < state.range(0); ++i)
       benchmark::DoNotOptimize(tracker += i);
   }
@@ -159,7 +163,7 @@
     test_vector = new std::vector<int>(size);
   }
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = from; i < to; i++) {
       // No need to lock test_vector_mu as ranges
       // do not overlap between threads.
@@ -179,7 +183,7 @@
   std::chrono::duration<double, std::micro> sleep_duration{
       static_cast<double>(microseconds)};
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     auto start = std::chrono::high_resolution_clock::now();
     // Simulate some useful workload with a sleep
     std::this_thread::sleep_for(
@@ -197,11 +201,11 @@
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
 
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 
 template <class... Args>
 void BM_with_args(benchmark::State& state, Args&&...) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44);
@@ -213,7 +217,7 @@
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
-#endif  // __cplusplus >= 201103L
+#endif  // BENCHMARK_HAS_CXX11
 
 static void BM_DenseThreadRanges(benchmark::State& st) {
   switch (st.range(0)) {
@@ -237,4 +241,4 @@
 BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
 BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 62d1154..89dfa58 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -25,8 +25,8 @@
        {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
   AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
-                        {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
-                        {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
+                        {"\"cpu_coefficient\": %float,$", MR_Next},
+                        {"\"real_coefficient\": %float,$", MR_Next},
                         {"\"big_o\": \"%bigo\",$", MR_Next},
                         {"\"time_unit\": \"ns\"$", MR_Next},
                         {"}", MR_Next},
@@ -46,7 +46,7 @@
 // ========================================================================= //
 
 void BM_Complexity_O1(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < 1024; ++i) {
       benchmark::DoNotOptimize(&i);
     }
@@ -94,7 +94,7 @@
   auto v = ConstructRandomVector(state.range(0));
   const int item_not_in_vector =
       state.range(0) * 2;  // Test worst case scenario (item not in vector)
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
   }
   state.SetComplexityN(state.range(0));
@@ -129,7 +129,7 @@
 
 static void BM_Complexity_O_N_log_N(benchmark::State& state) {
   auto v = ConstructRandomVector(state.range(0));
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::sort(v.begin(), v.end());
   }
   state.SetComplexityN(state.range(0));
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index a79d964..baa9ed9 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -8,6 +8,10 @@
 #error C++11 or greater detected. Should be C++03.
 #endif
 
+#ifdef BENCHMARK_HAS_CXX11
+#error C++11 or greater detected by the library. BENCHMARK_HAS_CXX11 is defined.
+#endif
+
 void BM_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
     volatile std::size_t x = state.iterations();
@@ -39,10 +43,21 @@
 BENCHMARK_TEMPLATE(BM_template1, long);
 BENCHMARK_TEMPLATE1(BM_template1, int);
 
+template <class T>
+struct BM_Fixture : public ::benchmark::Fixture {
+};
+
+BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
+  BM_empty(state);
+}
+BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
+  BM_empty(state);
+}
+
 void BM_counters(benchmark::State& state) {
     BM_empty(state);
     state.counters["Foo"] = 2;
 }
 BENCHMARK(BM_counters);
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index 7aac806..dd64a33 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -47,7 +47,7 @@
 
   if (called_once == false) try_invalid_pause_resume(state);
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
 
@@ -57,6 +57,22 @@
 }
 BENCHMARK(BM_diagnostic_test);
 
+
+void BM_diagnostic_test_keep_running(benchmark::State& state) {
+  static bool called_once = false;
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  while(state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  called_once = true;
+}
+BENCHMARK(BM_diagnostic_test_keep_running);
+
 int main(int argc, char* argv[]) {
   benchmark::internal::GetAbortHandler() = &TestHandler;
   benchmark::Initialize(&argc, argv);
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 3a20529..0e27065 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -36,31 +36,31 @@
 }  // end namespace
 
 static void NoPrefix(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(NoPrefix);
 
 static void BM_Foo(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Foo);
 
 static void BM_Bar(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_Bar);
 
 static void BM_FooBar(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_FooBar);
 
 static void BM_FooBa(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_FooBa);
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index bbc2f95..1462b10 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -28,7 +28,7 @@
 BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
   }
 }
 
@@ -37,7 +37,7 @@
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
@@ -46,4 +46,4 @@
 BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
 BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/map_test.cc b/test/map_test.cc
index 83457c9..311d2d2 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -18,9 +18,10 @@
 // Basic version.
 static void BM_MapLookup(benchmark::State& state) {
   const int size = state.range(0);
-  while (state.KeepRunning()) {
+  std::map<int, int> m;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::map<int, int> m = ConstructRandomMap(size);
+    m = ConstructRandomMap(size);
     state.ResumeTiming();
     for (int i = 0; i < size; ++i) {
       benchmark::DoNotOptimize(m.find(rand() % size));
@@ -44,7 +45,7 @@
 
 BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
   const int size = state.range(0);
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = 0; i < size; ++i) {
       benchmark::DoNotOptimize(m.find(rand() % size));
     }
@@ -53,4 +54,4 @@
 }
 BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12);
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/multiple_ranges_test.cc b/test/multiple_ranges_test.cc
index 8e67b3b..0a82382 100644
--- a/test/multiple_ranges_test.cc
+++ b/test/multiple_ranges_test.cc
@@ -43,7 +43,7 @@
 };
 
 BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     int product = state.range(0) * state.range(1) * state.range(2);
     for (int x = 0; x < product; x++) {
       benchmark::DoNotOptimize(x);
@@ -60,15 +60,15 @@
   // Test that the 'range()' without an argument is the same as 'range(0)'.
   assert(state.range() == state.range(0));
   assert(state.range() != state.range(1));
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}});
 
 static void BM_MultipleRanges(benchmark::State& st) {
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
   }
 }
 BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}});
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/options_test.cc b/test/options_test.cc
index 8eac068..fdec691 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -8,13 +8,13 @@
 #include <cassert>
 
 void BM_basic(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 
 void BM_basic_slow(benchmark::State& state) {
   std::chrono::milliseconds sleep_duration(state.range(0));
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::this_thread::sleep_for(
         std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
   }
@@ -44,7 +44,7 @@
 
 BENCHMARK(BM_basic)->Apply(CustomArgs);
 
-void BM_explicit_iteration_count(benchmark::State& st) {
+void BM_explicit_iteration_count(benchmark::State& state) {
   // Test that benchmarks specified with an explicit iteration count are
   // only run once.
   static bool invoked_before = false;
@@ -52,14 +52,14 @@
   invoked_before = true;
 
   // Test that the requested iteration count is respected.
-  assert(st.max_iterations == 42);
+  assert(state.max_iterations == 42);
   size_t actual_iterations = 0;
-  while (st.KeepRunning())
+  for (auto _ : state)
     ++actual_iterations;
-  assert(st.iterations() == st.max_iterations);
-  assert(st.iterations() == 42);
+  assert(state.iterations() == state.max_iterations);
+  assert(state.iterations() == 42);
 
 }
 BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index 2769b7a..8ab2c29 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -61,7 +61,7 @@
 // Test RegisterBenchmark with no additional arguments
 //----------------------------------------------------------------------------//
 void BM_function(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_function);
@@ -77,7 +77,7 @@
 #ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
 
 void BM_extra_args(benchmark::State& st, const char* label) {
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
   }
   st.SetLabel(label);
 }
@@ -99,7 +99,7 @@
 
 struct CustomFixture {
   void operator()(benchmark::State& st) {
-    while (st.KeepRunning()) {
+    for (auto _ : st) {
     }
   }
 };
@@ -116,7 +116,7 @@
   {
     const char* x = "42";
     auto capturing_lam = [=](benchmark::State& st) {
-      while (st.KeepRunning()) {
+      for (auto _ : st) {
       }
       st.SetLabel(x);
     };
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index 4a48143..1620b31 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -13,6 +13,41 @@
           {{"^[-]+$", MR_Next},
            {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
            {"^[-]+$", MR_Next}});
+// Registers expected context-header output; returns 0 to seed dummy_register.
+static int AddContextCases() {
+  // "s?": the plural "s" is presumably omitted for a single CPU -- TODO confirm.
+  AddCases(TC_ConsoleErr,
+           {{"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default},
+            {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next}});
+  AddCases(TC_JSONOut, {{"^\\{", MR_Default},
+                        {"\"context\":", MR_Next},
+                        {"\"date\": \"", MR_Next},
+                        {"\"num_cpus\": %int,$", MR_Next},
+                        {"\"mhz_per_cpu\": %float,$", MR_Next},
+                        {"\"cpu_scaling_enabled\": ", MR_Next},
+                        {"\"caches\": \\[$", MR_Next}});
+  auto const& Caches = benchmark::CPUInfo::Get().caches;
+  if (!Caches.empty()) {
+    AddCases(TC_ConsoleErr, {{"CPU Caches:$", MR_Next}});
+  }
+  for (size_t I = 0; I < Caches.size(); ++I) {
+    std::string num_caches_str =
+        Caches[I].num_sharing != 0 ? " \\(x%int\\)$" : "$";
+    AddCases(
+        TC_ConsoleErr,
+        {{"L%int (Data|Instruction|Unified) %intK" + num_caches_str, MR_Next}});
+    AddCases(TC_JSONOut, {{"\\{$", MR_Next},
+                          {"\"type\": \"", MR_Next},
+                          {"\"level\": %int,$", MR_Next},
+                          {"\"size\": %int,$", MR_Next},
+                          {"\"num_sharing\": %int$", MR_Next},
+                          {"}[,]{0,1}$", MR_Next}});
+  }
+  // One entry was queued per cache above; now expect the array to close.
+  AddCases(TC_JSONOut, {{"],$"}});
+  return 0;
+}
+int dummy_register = AddContextCases();
 ADD_CASES(TC_CSVOut, {{"%csv_header"}});
 
 // ========================================================================= //
@@ -20,7 +55,7 @@
 // ========================================================================= //
 
 void BM_basic(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_basic);
@@ -28,8 +63,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\"$", MR_Next},
                        {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
@@ -39,20 +74,20 @@
 // ========================================================================= //
 
 void BM_bytes_per_second(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.SetBytesProcessed(1);
 }
 BENCHMARK(BM_bytes_per_second);
 
 ADD_CASES(TC_ConsoleOut,
-          {{"^BM_bytes_per_second %console_report +%floatB/s$"}});
+          {{"^BM_bytes_per_second %console_report +%float[kM]{0,1}B/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bytes_per_second\": %int$", MR_Next},
+                       {"\"bytes_per_second\": %float$", MR_Next},
                        {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 
@@ -61,20 +96,20 @@
 // ========================================================================= //
 
 void BM_items_per_second(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.SetItemsProcessed(1);
 }
 BENCHMARK(BM_items_per_second);
 
 ADD_CASES(TC_ConsoleOut,
-          {{"^BM_items_per_second %console_report +%float items/s$"}});
+          {{"^BM_items_per_second %console_report +%float[kM]{0,1} items/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"items_per_second\": %int$", MR_Next},
+                       {"\"items_per_second\": %float$", MR_Next},
                        {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}});
 
@@ -83,7 +118,7 @@
 // ========================================================================= //
 
 void BM_label(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.SetLabel("some label");
 }
@@ -92,8 +127,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"label\": \"some label\"$", MR_Next},
                        {"}", MR_Next}});
@@ -106,7 +141,7 @@
 
 void BM_error(benchmark::State& state) {
   state.SkipWithError("message");
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_error);
@@ -123,7 +158,7 @@
 // ========================================================================= //
 
 void BM_no_arg_name(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_no_arg_name)->Arg(3);
@@ -136,7 +171,7 @@
 // ========================================================================= //
 
 void BM_arg_name(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
@@ -149,7 +184,7 @@
 // ========================================================================= //
 
 void BM_arg_names(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"});
@@ -163,7 +198,7 @@
 // ========================================================================= //
 
 void BM_Complexity_O1(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.SetComplexityN(state.range(0));
 }
@@ -179,30 +214,74 @@
 
 // Test that non-aggregate data is printed by default
 void BM_Repeat(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
+// Need at least two repetitions to be able to output any aggregate results.
+BENCHMARK(BM_Repeat)->Repetitions(2);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:2 %console_report$"},
+                          {"^BM_Repeat/repeats:2 %console_report$"},
+                          {"^BM_Repeat/repeats:2_mean %console_report$"},
+                          {"^BM_Repeat/repeats:2_median %console_report$"},
+                          {"^BM_Repeat/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_median\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}});
+// With two repetitions mean and median coincide, so also run three.
 BENCHMARK(BM_Repeat)->Repetitions(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"},
                           {"^BM_Repeat/repeats:3 %console_report$"},
                           {"^BM_Repeat/repeats:3 %console_report$"},
                           {"^BM_Repeat/repeats:3_mean %console_report$"},
+                          {"^BM_Repeat/repeats:3_median %console_report$"},
                           {"^BM_Repeat/repeats:3_stddev %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
                        {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
                        {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_median\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}});
+// The median differs between even and odd repetition counts, so test four too.
+BENCHMARK(BM_Repeat)->Repetitions(4);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:4 %console_report$"},
+                          {"^BM_Repeat/repeats:4 %console_report$"},
+                          {"^BM_Repeat/repeats:4 %console_report$"},
+                          {"^BM_Repeat/repeats:4 %console_report$"},
+                          {"^BM_Repeat/repeats:4_mean %console_report$"},
+                          {"^BM_Repeat/repeats:4_median %console_report$"},
+                          {"^BM_Repeat/repeats:4_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
+                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_median\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_stddev\",%csv_report$"}});
 
 // Test that a non-repeated test still prints non-aggregate results even when
 // only-aggregate reports have been requested
 void BM_RepeatOnce(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
@@ -212,23 +291,26 @@
 
 // Test that non-aggregate data is not reported
 void BM_SummaryRepeat(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
 ADD_CASES(TC_ConsoleOut,
           {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
            {"^BM_SummaryRepeat/repeats:3_mean %console_report$"},
+           {"^BM_SummaryRepeat/repeats:3_median %console_report$"},
            {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}});
 ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                        {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+                       {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
                        {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                       {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"},
                       {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}});
 
 void BM_RepeatTimeUnit(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 BENCHMARK(BM_RepeatTimeUnit)
@@ -238,18 +320,60 @@
 ADD_CASES(TC_ConsoleOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"},
+           {"^BM_RepeatTimeUnit/repeats:3_median %console_us_report$"},
            {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}});
 ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
                        {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
                        {"\"time_unit\": \"us\",?$"},
+                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+                       {"\"time_unit\": \"us\",?$"},
                        {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
                        {"\"time_unit\": \"us\",?$"}});
 ADD_CASES(TC_CSVOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"},
+           {"^\"BM_RepeatTimeUnit/repeats:3_median\",%csv_us_report$"},
            {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}});
 
 // ========================================================================= //
+// -------------------- Testing user-provided statistics ------------------- //
+// ========================================================================= //
+
+const auto UserStatistics = [](const std::vector<double>& v) {
+  return v.back();
+};
+void BM_UserStats(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_UserStats)
+    ->Repetitions(3)
+    ->ComputeStatistics("", UserStatistics);
+// Check that the user-provided statistic is computed and is reported after the
+// default ones. The empty name is intentional: it would sort before all others.
+ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/repeats:3 %console_report$"},
+                          {"^BM_UserStats/repeats:3 %console_report$"},
+                          {"^BM_UserStats/repeats:3 %console_report$"},
+                          {"^BM_UserStats/repeats:3_mean %console_report$"},
+                          {"^BM_UserStats/repeats:3_median %console_report$"},
+                          {"^BM_UserStats/repeats:3_stddev %console_report$"},
+                          {"^BM_UserStats/repeats:3_ %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_UserStats/repeats:3\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3_mean\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3_median\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3_stddev\",$"},
+                       {"\"name\": \"BM_UserStats/repeats:3_\",$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_UserStats/repeats:3\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3_median\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3_stddev\",%csv_report$"},
+                      {"^\"BM_UserStats/repeats:3_\",%csv_report$"}});
+
+// ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
 
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index b74d33c..0c2f348 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -70,6 +70,15 @@
 BENCHMARK(BM_error_before_running);
 ADD_CASES("BM_error_before_running", {{"", true, "error message"}});
 
+void BM_error_before_running_range_for(benchmark::State& state) {
+  state.SkipWithError("error message");
+  for (auto _ : state) {
+    assert(false);
+  }
+}
+BENCHMARK(BM_error_before_running_range_for);
+ADD_CASES("BM_error_before_running_range_for", {{"", true, "error message"}});
+
 void BM_error_during_running(benchmark::State& state) {
   int first_iter = true;
   while (state.KeepRunning()) {
@@ -93,8 +102,31 @@
                                       {"/2/threads:4", false, ""},
                                       {"/2/threads:8", false, ""}});
 
+void BM_error_during_running_ranged_for(benchmark::State& state) {
+  assert(state.max_iterations > 3 && "test requires at least a few iterations");
+  int first_iter = true;
+  // NOTE: Users should not write the for loop explicitly.
+  for (auto It = state.begin(), End = state.end(); It != End; ++It) {
+    if (state.range(0) == 1) {
+      assert(first_iter);
+      first_iter = false;
+      state.SkipWithError("error message");
+      // Test the unfortunate but documented behavior that the ranged-for loop
+      // doesn't automatically terminate when SkipWithError is set.
+      assert(++It != End);
+      break; // Required behavior
+    }
+  }
+}
+BENCHMARK(BM_error_during_running_ranged_for)->Arg(1)->Arg(2)->Iterations(5);
+ADD_CASES("BM_error_during_running_ranged_for",
+          {{"/1/iterations:5", true, "error message"},
+           {"/2/iterations:5", false, ""}});
+
+
+
 void BM_error_after_running(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
   if (state.thread_index <= (state.threads / 2))
diff --git a/test/statistics_test.cc b/test/statistics_test.cc
new file mode 100644
index 0000000..b4d6abb
--- /dev/null
+++ b/test/statistics_test.cc
@@ -0,0 +1,66 @@
+//===---------------------------------------------------------------------===//
+// statistics_test - Unit tests for src/statistics.cc
+//===---------------------------------------------------------------------===//
+
+#include "../src/statistics.h"
+#include "gtest/gtest.h"
+
+#include <vector>
+
+namespace {
+// Mean of a constant sample, an evenly spaced sample and an irregular sample.
+TEST(StatisticsTest, Mean) {
+  std::vector<double> Inputs;
+  {
+    Inputs = {42, 42, 42, 42};
+    double Res = benchmark::StatisticsMean(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 42.0);
+  }
+  {
+    Inputs = {1, 2, 3, 4};
+    double Res = benchmark::StatisticsMean(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 2.5);
+  }
+  {
+    Inputs = {1, 2, 5, 10, 10, 14};
+    double Res = benchmark::StatisticsMean(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 7.0);
+  }
+}
+
+// Median of even-sized inputs (expects the midpoint) and an odd-sized input.
+TEST(StatisticsTest, Median) {
+  std::vector<double> Inputs;
+  {
+    Inputs = {42, 42, 42, 42};
+    double Res = benchmark::StatisticsMedian(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 42.0);
+  }
+  {
+    Inputs = {1, 2, 3, 4};
+    double Res = benchmark::StatisticsMedian(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 2.5);
+  }
+  {
+    Inputs = {1, 2, 5, 10, 10};
+    double Res = benchmark::StatisticsMedian(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 5.0);
+  }
+}
+
+// StdDev of a constant sample (expects 0) and of {1, 2, 3} (expects 1).
+TEST(StatisticsTest, StdDev) {
+  std::vector<double> Inputs;
+  {
+    Inputs = {101, 101, 101, 101};
+    double Res = benchmark::StatisticsStdDev(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 0.0);
+  }
+  {
+    Inputs = {1, 2, 3};
+    double Res = benchmark::StatisticsStdDev(Inputs);
+    EXPECT_DOUBLE_EQ(Res, 1.0);
+  }
+}
+
+}  // end namespace
diff --git a/test/templated_fixture_test.cc b/test/templated_fixture_test.cc
new file mode 100644
index 0000000..ec5b4c0
--- /dev/null
+++ b/test/templated_fixture_test.cc
@@ -0,0 +1,28 @@
+
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <memory>
+
+template<typename T>
+class MyFixture : public ::benchmark::Fixture {
+public:
+  MyFixture() : data(0) {}
+
+  T data;
+};
+
+BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State &st) {
+  for (auto _ : st) {
+    data += 1;
+  }
+}
+
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, Bar, double)(benchmark::State& st) {
+  for (auto _ : st) {
+    data += 1.0;
+  }
+}
+BENCHMARK_REGISTER_F(MyFixture, Bar);
+
+BENCHMARK_MAIN();
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
index 5fc5b4d..9b8a613 100644
--- a/test/user_counters_tabular_test.cc
+++ b/test/user_counters_tabular_test.cc
@@ -54,7 +54,7 @@
 // ========================================================================= //
 
 void BM_Counters_Tabular(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -69,8 +69,8 @@
 BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"Bar\": %float,$", MR_Next},
                        {"\"Bat\": %float,$", MR_Next},
@@ -98,7 +98,7 @@
 // ========================================================================= //
 
 void BM_CounterRates_Tabular(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -113,8 +113,8 @@
 BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"Bar\": %float,$", MR_Next},
                        {"\"Bat\": %float,$", MR_Next},
@@ -145,7 +145,7 @@
 
 // set only some of the counters
 void BM_CounterSet0_Tabular(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -157,8 +157,8 @@
 BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"Bar\": %float,$", MR_Next},
                        {"\"Baz\": %float,$", MR_Next},
@@ -177,7 +177,7 @@
 
 // again.
 void BM_CounterSet1_Tabular(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -189,8 +189,8 @@
 BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"Bar\": %float,$", MR_Next},
                        {"\"Baz\": %float,$", MR_Next},
@@ -213,7 +213,7 @@
 
 // set only some of the counters, different set now.
 void BM_CounterSet2_Tabular(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
@@ -225,8 +225,8 @@
 BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"Bat\": %float,$", MR_Next},
                        {"\"Baz\": %float,$", MR_Next},
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
index 66df48b..06aafb1 100644
--- a/test/user_counters_test.cc
+++ b/test/user_counters_test.cc
@@ -19,7 +19,7 @@
 // ========================================================================= //
 
 void BM_Counters_Simple(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = 2 * (double)state.iterations();
@@ -28,8 +28,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
@@ -51,7 +51,7 @@
 
 namespace { int num_calls1 = 0; }
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = ++num_calls1;
@@ -64,11 +64,11 @@
             "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bytes_per_second\": %int,$", MR_Next},
-                       {"\"items_per_second\": %int,$", MR_Next},
+                       {"\"bytes_per_second\": %float,$", MR_Next},
+                       {"\"items_per_second\": %float,$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
                        {"}", MR_Next}});
@@ -92,7 +92,7 @@
 // ========================================================================= //
 
 void BM_Counters_Rate(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
@@ -102,8 +102,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
@@ -124,7 +124,7 @@
 // ========================================================================= //
 
 void BM_Counters_Threads(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = 2;
@@ -133,8 +133,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
@@ -153,7 +153,7 @@
 // ========================================================================= //
 
 void BM_Counters_AvgThreads(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads};
@@ -163,8 +163,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
@@ -184,7 +184,7 @@
 // ========================================================================= //
 
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
@@ -194,8 +194,8 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
                        {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %int,$", MR_Next},
-                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
                        {"\"time_unit\": \"ns\",$", MR_Next},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
diff --git a/tools/compare.py b/tools/compare.py
new file mode 100755
index 0000000..c4a47e8
--- /dev/null
+++ b/tools/compare.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python
+
+"""
+compare.py - versatile benchmark output compare tool
+"""
+
+import argparse
+from argparse import ArgumentParser
+import sys
+import gbench
+from gbench import util, report
+from gbench.util import *
+
+
+def check_inputs(in1, in2, flags):
+    """
+    Warn about odd input/flag combinations; exit(1) on a non-JSON out format.
+    """
+    kind1, _err1 = classify_input_file(in1)
+    kind2, _err2 = classify_input_file(in2)
+    out_path = find_benchmark_flag('--benchmark_out=', flags)
+    out_format = find_benchmark_flag('--benchmark_out_format=', flags)
+    if kind1 == IT_Executable and kind2 == IT_Executable and out_path:
+        print(("WARNING: '--benchmark_out=%s' will be passed to both "
+               "benchmarks causing it to be overwritten") % out_path)
+    if kind1 == IT_JSON and kind2 == IT_JSON and len(flags) > 0:
+        print("WARNING: passing optional flags has no effect since both "
+              "inputs are JSON")
+    if out_format not in (None, 'json'):
+        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`"
+               " is not supported.") % out_format)
+        sys.exit(1)
+
+
+def create_parser():
+    parser = ArgumentParser(
+        description='versatile benchmark output compare tool')
+    subparsers = parser.add_subparsers(
+        help='This tool has multiple modes of operation:',
+        dest='mode')
+    # Mode 'benchmarks': takes two inputs (test_baseline, test_contender).
+    parser_a = subparsers.add_parser(
+        'benchmarks',
+        help='The most simple use-case, compare all the output of these two benchmarks')
+    baseline = parser_a.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    contender = parser_a.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    parser_a.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+    # Mode 'filters': takes one input (test) followed by two filters.
+    parser_b = subparsers.add_parser(
+        'filters', help='Compare filter one with the filter two of benchmark')
+    baseline = parser_b.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test',
+        metavar='test',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_b.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_b.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+    # Mode 'benchmarksfiltered': takes two inputs, each followed by its filter.
+    parser_c = subparsers.add_parser(
+        'benchmarksfiltered',
+        help='Compare filter one of first benchmark with filter two of the second benchmark')
+    baseline = parser_c.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_c.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='The second benchmark executable or JSON output file, that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_c.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+    # The chosen mode name ends up in the parsed 'mode' attribute (dest above).
+    return parser
+
+
+def main():
+    """Parse argv, run or load both inputs and print the difference report."""
+    parser = create_parser()
+    # parse_args() rejects unknown flags itself (an assert vanishes under -O)
+    args = parser.parse_args()
+
+    if args.mode == 'benchmarks':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = ''
+        filter_contender = ''
+
+        # NOTE: if test_baseline == test_contender, you are analyzing the stdev
+
+        description = 'Comparing %s to %s' % (test_baseline, test_contender)
+    elif args.mode == 'filters':
+        test_baseline = args.test[0].name
+        test_contender = args.test[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if both filters are equal, you are analyzing the stdev
+
+        description = 'Comparing %s to %s (from %s)' % (
+            filter_baseline, filter_contender, args.test[0].name)
+    elif args.mode == 'benchmarksfiltered':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if test_baseline == test_contender and
+        # filter_baseline == filter_contender, you are analyzing the stdev
+
+        description = 'Comparing %s (from %s) to %s (from %s)' % (
+            filter_baseline, test_baseline, filter_contender, test_contender)
+    else:
+        # no (or an unknown) mode; Python 3 argparse lets the mode be omitted
+        print("Unrecognized mode of operation: '%s'" % args.mode)
+        sys.exit(1)
+
+    # Read only now: the attribute is defined by the sub-parsers, not 'parser'
+    benchmark_options = args.benchmark_options
+    check_inputs(test_baseline, test_contender, benchmark_options)
+
+    options_baseline = []
+    options_contender = []
+
+    if filter_baseline and filter_contender:
+        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
+        options_contender = ['--benchmark_filter=%s' % filter_contender]
+
+    # Run the benchmarks and report the results
+    json1 = json1_orig = gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline)
+    json2 = json2_orig = gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender)
+
+    # Now, filter the benchmarks so that the difference report can work
+    if filter_baseline and filter_contender:
+        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
+        json1 = gbench.report.filter_benchmark(
+            json1_orig, filter_baseline, replacement)
+        json2 = gbench.report.filter_benchmark(
+            json2_orig, filter_contender, replacement)
+
+    # Diff and output
+    output_lines = gbench.report.generate_difference_report(json1, json2)
+    print(description)
+    for ln in output_lines:
+        print(ln)
+
+
+import unittest
+
+
+class TestParser(unittest.TestCase):
+    """Check how create_parser() splits argv for each of the three modes."""
+    def setUp(self):
+        """Create a parser and the paths of two JSON fixtures to feed it."""
+        self.parser = create_parser()
+        here = os.path.dirname(os.path.realpath(__file__))
+        testInputs = os.path.join(here, 'gbench', 'Inputs')
+        # argparse.FileType('r') opens its argument, so these files must exist
+        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
+        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')
+
+    def test_benchmarks_basic(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1])
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1, 'd'])
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.benchmark_options, ['d'])
+
+    def test_benchmarks_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.benchmark_options, ['e'])
+
+    def test_filters_basic(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd'])
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_filters_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd', 'e'])
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertEqual(parsed.benchmark_options, ['e'])
+
+    def test_filters_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertEqual(parsed.benchmark_options, ['f'])
+
+    def test_benchmarksfiltered_basic(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarksfiltered_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertEqual(parsed.benchmark_options[0], 'f')
+
+    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertEqual(parsed.benchmark_options[0], 'g')
+
+
+if __name__ == '__main__':
+    # Runs the tool only; the TestParser cases above are not executed from here.
+    main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
index d54baaa..7bbf0d0 100755
--- a/tools/compare_bench.py
+++ b/tools/compare_bench.py
@@ -39,21 +39,20 @@
     parser.add_argument(
         'test2', metavar='test2', type=str, nargs=1,
         help='A benchmark executable or JSON output file')
-    # FIXME this is a dummy argument which will never actually match
-    # any --benchmark flags but it helps generate a better usage message
     parser.add_argument(
-        'benchmark_options', metavar='benchmark_option', nargs='*',
+        'benchmark_options', metavar='benchmark_options', nargs=argparse.REMAINDER,
         help='Arguments to pass when running benchmark executables'
     )
     args, unknown_args = parser.parse_known_args()
     # Parse the command line flags
     test1 = args.test1[0]
     test2 = args.test2[0]
-    if args.benchmark_options:
+    if unknown_args:
+        # should never happen
         print("Unrecognized positional argument arguments: '%s'"
-              % args.benchmark_options)
+              % unknown_args)
         exit(1)
-    benchmark_options = unknown_args
+    benchmark_options = args.benchmark_options
     check_inputs(test1, test2, benchmark_options)
     # Run the benchmarks and report the results
     json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
index 37faed4..d7ec6a9 100644
--- a/tools/gbench/Inputs/test1_run1.json
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -29,6 +29,20 @@
       "time_unit": "ns"
     },
     {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
       "name": "BM_10PercentFaster",
       "iterations": 1000,
       "real_time": 100,
@@ -55,6 +69,34 @@
       "real_time": 10000,
       "cpu_time": 10000,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_BadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.4,
+      "cpu_time": 0.5,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
index aed5151..59a5ffa 100644
--- a/tools/gbench/Inputs/test1_run2.json
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -29,6 +29,20 @@
       "time_unit": "ns"
     },
     {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 98.9999999,
+      "cpu_time": 98.9999999,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100.9999999,
+      "cpu_time": 100.9999999,
+      "time_unit": "ns"
+    },
+    {
       "name": "BM_10PercentFaster",
       "iterations": 1000,
       "real_time": 90,
@@ -45,8 +59,8 @@
     {
       "name": "BM_100xSlower",
       "iterations": 1000,
-      "real_time": 10000,
-      "cpu_time": 10000,
+      "real_time": 1.0000e+04,
+      "cpu_time": 1.0000e+04,
       "time_unit": "ns"
     },
     {
@@ -55,6 +69,34 @@
       "real_time": 100,
       "cpu_time": 100,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 66.665,
+      "cpu_time": 66.664,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_BadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.04,
+      "cpu_time": 0.6,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "ns"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/tools/gbench/Inputs/test2_run.json b/tools/gbench/Inputs/test2_run.json
new file mode 100644
index 0000000..15bc698
--- /dev/null
+++ b/tools/gbench/Inputs/test2_run.json
@@ -0,0 +1,81 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_Hi",
+      "iterations": 1234,
+      "real_time": 42,
+      "cpu_time": 24,
+      "time_unit": "ms"
+    },
+    {
+      "name": "BM_Zero",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Zero/4",
+      "iterations": 4000,
+      "real_time": 40,
+      "cpu_time": 40,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero/3",
+      "iterations": 3000,
+      "real_time": 30,
+      "cpu_time": 30,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One",
+      "iterations": 5000,
+      "real_time": 5,
+      "cpu_time": 5,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One/4",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One/3",
+      "iterations": 1500,
+      "real_time": 15,
+      "cpu_time": 15,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Bye",
+      "iterations": 5321,
+      "real_time": 11,
+      "cpu_time": 63,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 015d33d..8d68fe9 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -1,6 +1,8 @@
 """report.py - Utilities for reporting statistics about benchmark results
 """
 import os
+import re
+import copy
 
 class BenchmarkColor(object):
     def __init__(self, name, code):
@@ -66,19 +68,36 @@
     return float(new_val - old_val) / abs(old_val)
 
 
+def filter_benchmark(json_orig, family, replacement=""):
+    """
+    Return a new dict with deep copies of the benchmarks whose name matches the
+    regex 'family', each match replaced by 'replacement' taken literally.
+    """
+    regex = re.compile(family)
+    filtered = {'benchmarks': []}
+    for be in json_orig['benchmarks']:
+        if not regex.search(be['name']):
+            continue
+        filteredbench = copy.deepcopy(be) # Do NOT modify the old name!
+        filteredbench['name'] = regex.sub(lambda m: replacement, be['name'])
+        filtered['benchmarks'].append(filteredbench)
+    return filtered
+
+
 def generate_difference_report(json1, json2, use_color=True):
     """
     Calculate and report the difference between each test of two benchmarks
     runs specified as 'json1' and 'json2'.
     """
-    first_col_width = find_longest_name(json1['benchmarks']) + 5
+    first_col_width = find_longest_name(json1['benchmarks'])
     def find_test(name):
         for b in json2['benchmarks']:
             if b['name'] == name:
                 return b
         return None
-    first_line = "{:<{}s}     Time           CPU           Old           New".format(
-        'Benchmark', first_col_width)
+    first_col_width = max(first_col_width, len('Benchmark'))
+    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
+        'Benchmark', 12 + first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
 
     gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn)
@@ -87,6 +106,9 @@
         if not other_bench:
             continue
 
+        if bn['time_unit'] != other_bench['time_unit']:
+            continue
+
         def get_color(res):
             if res > 0.05:
                 return BC_FAIL
@@ -94,12 +116,13 @@
                 return BC_WHITE
             else:
                 return BC_CYAN
-        fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14d}{:14d}"
+        fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
         tres = calculate_change(bn['real_time'], other_bench['real_time'])
         cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
         output_strs += [color_format(use_color, fmt_str,
             BC_HEADER, bn['name'], first_col_width,
             get_color(tres), tres, get_color(cpures), cpures,
+            bn['real_time'], other_bench['real_time'],
             bn['cpu_time'], other_bench['cpu_time'],
             endc=BC_ENDC)]
     return output_strs
@@ -123,24 +146,63 @@
 
     def test_basic(self):
         expect_lines = [
-            ['BM_SameTimes', '+0.00', '+0.00', '10', '10'],
-            ['BM_2xFaster', '-0.50', '-0.50', '50', '25'],
-            ['BM_2xSlower', '+1.00', '+1.00', '50', '100'],
-            ['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'],
-            ['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'],
-            ['BM_100xSlower', '+99.00', '+99.00', '100', '10000'],
-            ['BM_100xFaster', '-0.99', '-0.99', '10000', '100'],
+            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
+            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
+            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
+            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
+            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
+            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
+            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
+            ['BM_100xSlower', '+99.0000', '+99.0000', '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900', '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000', '-0.1000', '100', '110', '100', '90'],
+            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
+            ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
         ]
         json1, json2 = self.load_results()
         output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in xrange(0, len(output_lines)):
+        for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(len(parts), 5)
+            self.assertEqual(len(parts), 7)
+            self.assertEqual(parts, expect_lines[i])
+
+
+class TestReportDifferenceBetweenFamilies(unittest.TestCase):
+    def load_result(self):
+        """Return the parsed contents of the Inputs/test2_run.json fixture."""
+        import json
+        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
+        testOutput = os.path.join(testInputs, 'test2_run.json')
+        with open(testOutput, 'r') as f:
+            return json.load(f)
+
+    def test_basic(self):
+        """Diff the 'BM_Z.ro' family against the 'BM_O.e' family of one file."""
+        expect_lines = [
+            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
+            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
+            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
+            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+        ]
+        results = self.load_result()
+        json1 = filter_benchmark(results, "BM_Z.ro", ".")
+        json2 = filter_benchmark(results, "BM_O.e", ".")
+        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n\n" + "\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 7)
             self.assertEqual(parts, expect_lines[i])
 
 
 if __name__ == '__main__':
     unittest.main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
