Merge remote-tracking branch 'aosp/upstream-master' into mymerge am: 01950b6483
am: db1632fadc

Change-Id: I00ff15fbb4177bb0709992ba0f4b29e303e3f72e
diff --git a/.travis.yml b/.travis.yml
index 8b138ce..bf26395 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,3 +39,5 @@
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
       coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
     fi
+
+sudo: required
diff --git a/AUTHORS b/AUTHORS
index 5a4b355..0f93e01 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -8,6 +8,7 @@
 #
 # Please keep the list sorted.
 
+Albert Pretorius <pretoalb@gmail.com>
 Arne Beer <arne@twobeer.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
@@ -16,7 +17,9 @@
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Google Inc.
+Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
diff --git a/Android.bp b/Android.bp
index f8c805e..14a598b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -32,6 +32,7 @@
         "src/benchmark.cc",
         "src/colorprint.cc",
         "src/commandlineflags.cc",
+        "src/complexity.cc",
         "src/console_reporter.cc",
         "src/csv_reporter.cc",
         "src/json_reporter.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c72252..a1251e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@
 
 foreach(p
     CMP0054 # CMake 3.1
+    CMP0056 # export EXE_LINKER_FLAGS to try_run
     )
   if(POLICY ${p})
     cmake_policy(SET ${p} NEW)
@@ -33,15 +34,29 @@
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
   # Turn compiler warnings up to 11
-  add_cxx_compiler_flag(-W4)
+  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
-    set(CMAKE_CXX_FLAGS_RELEASE "/GL")
-    set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "/LTCG")
-    set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "/LTCG")
-    set(CMAKE_EXE_LINKER_FLAGS_RELEASE "/LTCG")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
   endif()
 else()
   # Try and enable C++11. Don't use C++14 because it doesn't work in some
@@ -57,6 +72,8 @@
   add_cxx_compiler_flag(-Wextra)
   add_cxx_compiler_flag(-Wshadow)
   add_cxx_compiler_flag(-Werror RELEASE)
+  add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
+  add_cxx_compiler_flag(-Werror MINSIZEREL)
   add_cxx_compiler_flag(-pedantic)
   add_cxx_compiler_flag(-pedantic-errors)
   add_cxx_compiler_flag(-Wshorten-64-to-32)
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index ed55bcf..4bff126 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -22,16 +22,22 @@
 #
 # Please keep the list sorted.
 
+Albert Pretorius <pretoalb@gmail.com>
 Arne Beer <arne@twobeer.de>
+Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
 Christopher Seymour <chris.j.seymour@hotmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
 Dominic Hamon <dma@stripysock.com>
+Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kai Wolf <kai.wolf@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
diff --git a/README.md b/README.md
index 21ae478..a0bcc61 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-benchmark
-=========
+# benchmark
 [![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
 [![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
 [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
@@ -10,10 +9,9 @@
 
 IRC channel: https://freenode.net #googlebenchmark
 
-Example usage
--------------
-Define a function that executes the code to be measured a
-specified number of times:
+## Example usage
+### Basic usage
+Define a function that executes the code to be measured.
 
 ```c++
 static void BM_StringCreation(benchmark::State& state) {
@@ -34,15 +32,16 @@
 BENCHMARK_MAIN();
 ```
 
-Sometimes a family of microbenchmarks can be implemented with
-just one routine that takes an extra argument to specify which
-one of the family of benchmarks to run.  For example, the following
-code defines a family of microbenchmarks for measuring the speed
-of `memcpy()` calls of different lengths:
+### Passing arguments
+Sometimes a family of benchmarks can be implemented with just one routine that
+takes an extra argument to specify which one of the family of benchmarks to
+run. For example, the following code defines a family of benchmarks for
+measuring the speed of `memcpy()` calls of different lengths:
 
 ```c++
 static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
+  char* src = new char[state.range_x()];
+  char* dst = new char[state.range_x()];
   memset(src, 'x', state.range_x());
   while (state.KeepRunning())
     memcpy(dst, src, state.range_x());
@@ -54,18 +53,26 @@
 BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
 ```
 
-The preceding code is quite repetitive, and can be replaced with the
-following short-hand.  The following invocation will pick a few
-appropriate arguments in the specified range and will generate a
-microbenchmark for each such argument.
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following invocation will pick a few appropriate arguments in
+the specified range and will generate a benchmark for each such argument.
 
 ```c++
 BENCHMARK(BM_memcpy)->Range(8, 8<<10);
 ```
 
-You might have a microbenchmark that depends on two inputs.  For
-example, the following code defines a family of microbenchmarks for
-measuring the speed of set insertion.
+By default, the arguments in the range are generated in multiples of eight and
+the invocation above selects [ 8, 64, 512, 4k, 8k ]. In the following code the
+range multiplier is changed to multiples of two.
+
+```c++
+BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
+```
+Now the arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
+
+You might have a benchmark that depends on two inputs. For example, the
+following code defines a family of benchmarks for measuring the speed of set
+insertion.
 
 ```c++
 static void BM_SetInsert(benchmark::State& state) {
@@ -88,19 +95,18 @@
     ->ArgPair(8<<10, 512);
 ```
 
-The preceding code is quite repetitive, and can be replaced with
-the following short-hand.  The following macro will pick a few
-appropriate arguments in the product of the two specified ranges
-and will generate a microbenchmark for each such pair.
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following macro will pick a few appropriate arguments in the
+product of the two specified ranges and will generate a benchmark for each such
+pair.
 
 ```c++
 BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
 ```
 
-For more complex patterns of inputs, passing a custom function
-to Apply allows programmatic specification of an
-arbitrary set of arguments to run the microbenchmark on.
-The following example enumerates a dense range on one parameter,
+For more complex patterns of inputs, passing a custom function to `Apply` allows
+programmatic specification of an arbitrary set of arguments on which to run the
+benchmark. The following example enumerates a dense range on one parameter,
 and a sparse range on the second.
 
 ```c++
@@ -112,9 +118,44 @@
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
 
-Templated microbenchmarks work the same way:
-Produce then consume 'size' messages 'iters' times
-Measures throughput in the absence of multiprogramming.
+### Calculate asymptotic complexity (Big O)
+Asymptotic complexity can be calculated for a family of benchmarks. The
+following code will calculate the coefficient for the high-order term in the
+running time and the normalized root-mean-square error of string comparison.
+
+```c++
+static void BM_StringCompare(benchmark::State& state) {
+  std::string s1(state.range_x(), '-');
+  std::string s2(state.range_x(), '-');
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(s1.compare(s2));
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_StringCompare)
+    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
+```
+
+As shown in the following invocation, asymptotic complexity can also be
+calculated automatically.
+
+```c++
+BENCHMARK(BM_StringCompare)
+    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity();
+```
+
+The following code specifies the asymptotic complexity with a lambda function,
+which can be used to customize the high-order term calculation.
+
+```c++
+BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
+    ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; });
+```
+
+### Templated benchmarks
+Templated benchmarks work the same way. The following example produces and
+consumes messages of size `sizeof(v)`, `range_x` times, and reports the
+throughput in the absence of multiprogramming.
 
 ```c++
 template <class Q> int BM_Sequential(benchmark::State& state) {
@@ -145,11 +186,32 @@
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
 
+## Passing arbitrary arguments to a benchmark
+In C++11 it is possible to define a benchmark that takes an arbitrary number
+of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
+macro creates a benchmark that invokes `func` with the `benchmark::State` as
+the first argument followed by the specified `args...`.
+The `test_case_name` is appended to the name of the benchmark and
+should describe the values passed.
+
+```c++
+template <class ...ExtraArgs>
+void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+  [...]
+}
+// Registers a benchmark named "BM_takes_args/int_string_test" that passes
+// the specified values to `extra_args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+```
+Note that elements of `...args` may refer to global variables. Users should
+avoid modifying global state inside of a benchmark.
+
+## Multithreaded benchmarks
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
 it is guaranteed that none of the threads will start until all have called
-KeepRunning, and all will have finished before KeepRunning returns false. As
-such, any global setup or teardown you want to do can be
-wrapped in a check against the thread index:
+`KeepRunning`, and all will have finished before `KeepRunning` returns false. As
+such, any global setup or teardown can be wrapped in a check against the thread
+index:
 
 ```c++
 static void BM_MultiThreaded(benchmark::State& state) {
@@ -176,8 +238,49 @@
 
 Without `UseRealTime`, CPU time is used by default.
 
+
+## Manual timing
+For benchmarking something for which neither CPU time nor real time is
+correct or accurate enough, completely manual timing is supported using
+the `UseManualTime` function.
+
+When `UseManualTime` is used, the benchmarked code must call
+`SetIterationTime` once per iteration of the `KeepRunning` loop to
+report the manually measured time.
+
+An example use case for this is benchmarking GPU execution (e.g. OpenCL
+or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
+be measured accurately using CPU time or real time. Instead, such work can be
+measured accurately with a dedicated API, and these measurement results
+can be reported back with `SetIterationTime`.
+
+```c++
+static void BM_ManualTiming(benchmark::State& state) {
+  int microseconds = state.range_x();
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  while (state.KeepRunning()) {
+    auto start = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(sleep_duration);
+    auto end   = std::chrono::high_resolution_clock::now();
+
+    auto elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed_seconds.count());
+  }
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
+```
+
+## Preventing optimisation
 To prevent a value or expression from being optimized away by the compiler
-the `benchmark::DoNotOptimize(...)` function can be used.
+the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
+functions can be used.
 
 ```c++
 static void BM_test(benchmark::State& state) {
@@ -190,8 +293,77 @@
 }
 ```
 
-Benchmark Fixtures
-------------------
+`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either
+memory or a register. For GNU-based compilers it acts as a read/write barrier
+for global memory. More specifically, it forces the compiler to flush pending
+writes to memory and reload any other values as necessary.
+
+Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
+in any way. `<expr>` may even be removed entirely when the result is already
+known. For example:
+
+```c++
+  /* Example 1: `<expr>` is removed entirely. */
+  int foo(int x) { return x + 42; }
+  while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
+
+  /* Example 2: Result of '<expr>' is only reused */
+  int bar(int) __attribute__((const));
+  while (...) DoNotOptimize(bar(0)); // Optimized to:
+  // int __result__ = bar(0);
+  // while (...) DoNotOptimize(__result__);
+```
+
+The second tool for preventing optimizations is `ClobberMemory()`. In essence,
+`ClobberMemory()` forces the compiler to perform all pending writes to global
+memory. Memory managed by block-scope objects must be "escaped" using
+`DoNotOptimize(...)` before it can be clobbered. In the example below,
+`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
+away.
+
+```c++
+static void BM_vector_push_back(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    std::vector<int> v;
+    v.reserve(1);
+    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
+    v.push_back(42);
+    benchmark::ClobberMemory(); // Force 42 to be written to memory.
+  }
+}
+```
+
+Note that `ClobberMemory()` is only available for GNU-based compilers.
+
+## Set time unit manually
+If a benchmark runs for a few milliseconds it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To set the time unit manually, specify it on the benchmark:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+## Controlling the number of iterations
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one and at most 1e9; the iteration count is increased
+until either the CPU time exceeds the minimum time or the wall-clock time
+exceeds 5x the minimum time. The minimum time is set with the
+`--benchmark_min_time` flag, or per benchmark by calling `MinTime` on the
+registered benchmark object.
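+
+For example, the following sketch (reusing the `BM_test` benchmark from the
+earlier examples) raises the per-benchmark minimum time to two seconds:
+
+```c++
+BENCHMARK(BM_test)->MinTime(2.0);
+```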
+
+## Reporting the mean and standard deviation of repeated benchmarks
+By default each benchmark is run once and that single result is reported.
+However, benchmarks are often noisy and a single result may not be
+representative of the overall behavior. For this reason it is possible to
+repeat the benchmark several times.
+
+The number of runs of each benchmark is specified globally by the
+`--benchmark_repetitions` flag or on a per-benchmark basis by calling
+`Repetitions` on the registered benchmark object. When a benchmark is run
+more than once, the mean and standard deviation of the runs are reported.
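+
+For example, the following sketch (again using the `BM_test` benchmark from the
+earlier examples) runs the benchmark ten times and reports the mean and
+standard deviation:
+
+```c++
+BENCHMARK(BM_test)->Repetitions(10);
+```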
+
+## Fixtures
 Fixture tests are created by
 first defining a type that derives from ::benchmark::Fixture and then
 creating/registering the tests using the following macros:
@@ -221,10 +393,41 @@
 /* BarTest is now registered */
 ```
 
-Output Formats
---------------
+## Exiting Benchmarks in Error
+
+When an error caused by an external influence, such as file I/O or network
+communication, occurs within a benchmark, the
+`State::SkipWithError(const char* msg)` function can be used to skip that run
+of the benchmark and report the error. Note that only future iterations of the
+`KeepRunning()` loop are skipped. Users may explicitly `return` to exit the
+benchmark immediately.
+
+The `SkipWithError(...)` function may be used at any point within the benchmark,
+including before and after the `KeepRunning()` loop.
+
+For example:
+
+```c++
+static void BM_test(benchmark::State& state) {
+  auto resource = GetResource();
+  if (!resource.good()) {
+      state.SkipWithError("Resource is not good!");
+      // KeepRunning() loop will not be entered.
+  }
+  while (state.KeepRunning()) {
+      auto data = resource.read_data();
+      if (!resource.good()) {
+        state.SkipWithError("Failed to read data!");
+        break; // Needed to skip the rest of the iteration.
+     }
+     do_stuff(data);
+  }
+}
+```
+
+## Output Formats
 The library supports multiple output formats. Use the
-`--benchmark_format=<tabular|json>` flag to set the format type. `tabular` is
+`--benchmark_format=<console|json|csv>` flag to set the format type. `console` is
 the default format.
 
 The Tabular format is intended to be a human readable format. By default
@@ -290,8 +493,7 @@
 "BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
 ```
 
-Debug vs Release
-----------------
+## Debug vs Release
 By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
 
 ```
@@ -304,6 +506,5 @@
 cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
 ```
 
-Linking against the library
----------------------------
+## Linking against the library
 When using gcc, it is necessary to link against pthread to avoid runtime exceptions. This is due to how gcc implements std::thread. See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index 23ee8ac..3059024 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -21,12 +21,15 @@
   string(TOLOWER ${FILE} FILE)
   string(TOUPPER ${FILE} VAR)
   string(TOUPPER "HAVE_${VAR}" FEATURE)
+  if (DEFINED HAVE_${VAR})
+    return()
+  endif()
   message("-- Performing Test ${FEATURE}")
   try_run(RUN_${FEATURE} COMPILE_${FEATURE}
           ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp)
   if(RUN_${FEATURE} EQUAL 0)
     message("-- Performing Test ${FEATURE} -- success")
-    set(HAVE_${VAR} 1 PARENT_SCOPE)
+    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
diff --git a/cmake/posix_regex.cpp b/cmake/posix_regex.cpp
index a31af80..466dc62 100644
--- a/cmake/posix_regex.cpp
+++ b/cmake/posix_regex.cpp
@@ -7,6 +7,8 @@
   if (ec != 0) {
     return ec;
   }
-  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+  int ret = regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+  regfree(&re);
+  return ret;
 }
 
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index 0c5115d..664ca2a 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -137,6 +137,13 @@
   }
 }
 BENCHMARK(BM_MultiThreaded)->Threads(4);
+
+
+If a benchmark runs for a few milliseconds it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To set the time unit manually, specify it on the benchmark:
+
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 */
 
 #ifndef BENCHMARK_BENCHMARK_API_H_
@@ -153,10 +160,17 @@
 
 void Initialize(int* argc, char** argv);
 
-// Otherwise, run all benchmarks specified by the --benchmark_filter flag,
-// and exit after running the benchmarks.
-void RunSpecifiedBenchmarks();
-void RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
+// Generate a list of benchmarks matching the specified --benchmark_filter flag.
+// If --benchmark_list_tests is specified, return after printing the name
+// of each matching benchmark. Otherwise, run each matching benchmark and
+// report the results.
+//
+// The second overload reports the results using the specified 'reporter'.
+//
+// RETURNS: The number of matching benchmarks.
+size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
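+//
+// A minimal usage sketch (assuming benchmarks have been registered elsewhere
+// with the BENCHMARK macro and that this is the program's entry point):
+//
+//   int main(int argc, char** argv) {
+//     ::benchmark::Initialize(&argc, argv);
+//     ::benchmark::RunSpecifiedBenchmarks();
+//     return 0;
+//   }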
+
 
 // If this routine is called, peak memory allocation past this point in the
 // benchmark is reported at the end of the benchmark report line. (It is
@@ -193,61 +207,90 @@
 
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
-// intented to add little to no overhead.
-// See: http://stackoverflow.com/questions/28287064
-#if defined(__clang__) && defined(__GNUC__)
-// TODO(ericwf): Clang has a bug where it tries to always use a register
-// even if value must be stored in memory. This causes codegen to fail.
-// To work around this we remove the "r" modifier so the operand is always
-// loaded into memory.
+// intended to add little to no overhead.
+// See: https://youtu.be/nXaxk27zwlk?t=2441
+#if defined(__GNUC__)
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    asm volatile("" : "+m" (const_cast<Tp&>(value)));
+    asm volatile("" : : "g"(value) : "memory");
 }
-#elif defined(__GNUC__)
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    asm volatile("" : "+rm" (const_cast<Tp&>(value)));
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+    asm volatile("" : : : "memory");
 }
 #else
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
     internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
+// FIXME Add ClobberMemory() for non-gnu compilers
 #endif
 
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit {
+  kNanosecond,
+  kMicrosecond,
+  kMillisecond
+};
+
+// BigO is passed to a benchmark in order to specify the asymptotic computational
+// complexity for the benchmark. If oAuto is selected, the complexity will be
+// calculated automatically by finding the best fit.
+enum BigO {
+  oNone,
+  o1,
+  oN,
+  oNSquared,
+  oNCubed,
+  oLogN,
+  oNLogN,
+  oAuto,
+  oLambda
+};
+
+// BigOFunc is passed to a benchmark in order to specify the asymptotic 
+// computational complexity for the benchmark.
+typedef double(BigOFunc)(int);
 
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
 class State {
 public:
-  State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i, int n_threads);
+  State(size_t max_iters, bool has_x, int x, bool has_y, int y,
+        int thread_i, int n_threads);
 
-  // Returns true iff the benchmark should continue through another iteration.
+  // Returns true if the benchmark should continue through another iteration.
   // NOTE: A benchmark may not return from the test until KeepRunning() has
   // returned false.
   bool KeepRunning() {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-        ResumeTiming();
-        started_ = true;
+      assert(!finished_);
+      started_ = true;
+      ResumeTiming();
     }
     bool const res = total_iterations_++ < max_iterations;
     if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-        assert(started_);
+      assert(started_ && (!finished_ || error_occurred_));
+      if (!error_occurred_) {
         PauseTiming();
-        // Total iterations now is one greater than max iterations. Fix this.
-        total_iterations_ = max_iterations;
+      }
+      // Total iterations now is one greater than max iterations. Fix this.
+      total_iterations_ = max_iterations;
+      finished_ = true;
     }
     return res;
   }
 
-  // REQUIRES: timer is running
+  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
   // Stop the benchmark timer.  If not called, the timer will be
   // automatically stopped after KeepRunning() returns false for the first time.
   //
   // For threaded benchmarks the PauseTiming() function acts
   // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all threads have made their ith call.
+  // function will block until all active threads have made their ith call.
   // The timer will stop when the last thread has called this function.
   //
   // NOTE: PauseTiming()/ResumeTiming() are relatively
@@ -255,13 +298,14 @@
   // within each benchmark iteration, if possible.
   void PauseTiming();
 
-  // REQUIRES: timer is not running
+  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
   // Start the benchmark timer.  The timer is NOT running on entrance to the
   // benchmark function. It begins running after the first call to KeepRunning()
   //
   // For threaded benchmarks the ResumeTiming() function acts
   // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all threads have made their ith call.
+  // function will block until all active threads have made their ith call.
   // The timer will start when the last thread has called this function.
   //
   // NOTE: PauseTiming()/ResumeTiming() are relatively
@@ -269,6 +313,34 @@
   // within each benchmark iteration, if possible.
   void ResumeTiming();
 
+  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
+  //            current thread.
+  // Skip any future iterations of the 'KeepRunning()' loop in the current
+  // thread and report an error with the specified 'msg'. After this call
+  // the user may explicitly 'return' from the benchmark.
+  //
+  // For threaded benchmarks only the current thread stops executing. If
+  // multiple threads report an error only the first error message is used.
+  // The current thread is no longer considered 'active' by
+  // 'PauseTiming()' and 'ResumeTiming()'.
+  //
+  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop, the current iteration will finish. It is the
+  // user's responsibility to exit the scope as needed.
+  void SkipWithError(const char* msg);
+
+  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
+  // Set the manually measured time for this benchmark iteration, which
+  // is used instead of automatically measured time if UseManualTime() was
+  // specified.
+  //
+  // For threaded benchmarks the SetIterationTime() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The time will be set by the last thread to call this function.
+  void SetIterationTime(double seconds);
+
   // Set the number of bytes processed by the current benchmark
   // execution.  This routine is typically called once at the end of a
   // throughput oriented benchmark.  If this routine is called with a
@@ -286,6 +358,19 @@
     return bytes_processed_;
   }
 
+  // If this routine is called with complexity_n > 0 and a complexity report is
+  // requested for the benchmark family, then the current benchmark will be part
+  // of the computation and complexity_n will represent the length of N.
+  BENCHMARK_ALWAYS_INLINE
+  void SetComplexityN(int complexity_n) {
+    complexity_n_ = complexity_n;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t complexity_length_n() {
+    return complexity_n_;
+  }
+
   // If this routine is called with items > 0, then an items/s
   // label is printed on the benchmark report line for the currently
   // executing benchmark. It is typically called at the end of a processing
@@ -305,10 +390,10 @@
   // If this routine is called, the specified label is printed at the
   // end of the benchmark report line for the currently executing
   // benchmark.  Example:
-  //  static void BM_Compress(int iters) {
+  //  static void BM_Compress(benchmark::State& state) {
   //    ...
   //    double compress = input_size / output_size;
-  //    benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
   //  }
   // Produces output that looks like:
   //  BM_Compress   50         50   14115038  compress:27.3%
@@ -346,6 +431,7 @@
 
 private:
   bool started_;
+  bool finished_;
   size_t total_iterations_;
 
   bool has_range_x_;
@@ -357,6 +443,11 @@
   size_t bytes_processed_;
   size_t items_processed_;
 
+  int complexity_n_;
+
+public:
+  // FIXME: Make this private somehow.
+  bool error_occurred_;
 public:
   // Index of the executing thread. Values from [0, threads).
   const int thread_index;
@@ -390,6 +481,9 @@
   // REQUIRES: The function passed to the constructor must accept an arg1.
   Benchmark* Arg(int x);
 
+  // Run this benchmark with the given time unit for the generated output report
+  Benchmark* Unit(TimeUnit unit);
+
   // Run this benchmark once for a number of values picked from the
   // range [start..limit].  (start and limit are always picked.)
   // REQUIRES: The function passed to the constructor must accept an arg1.
@@ -416,10 +510,20 @@
   // Threads, etc.
   Benchmark* Apply(void (*func)(Benchmark* benchmark));
 
+  // Set the range multiplier for non-dense ranges. If not called, the default
+  // range multiplier kRangeMultiplier will be used.
+  Benchmark* RangeMultiplier(int multiplier);
+
   // Set the minimum amount of time to use when running this benchmark. This
   // option overrides the `benchmark_min_time` flag.
+  // REQUIRES: `t > 0`
   Benchmark* MinTime(double t);
 
+  // Specify the number of times to repeat this benchmark. This option overrides
+  // the `benchmark_repetitions` flag.
+  // REQUIRES: `n > 0`
+  Benchmark* Repetitions(int n);
+
   // If a particular benchmark is I/O bound, runs multiple threads internally or
   // if for some reason CPU timings are not representative, call this method. If
   // called, the elapsed time will be used to control how many iterations are
@@ -427,6 +531,21 @@
   // called, the cpu time used by the benchmark will be used.
   Benchmark* UseRealTime();
 
+  // If a benchmark must measure time manually (e.g. if GPU execution time is being
+  // measured), call this method. If called, each benchmark iteration should call
+  // SetIterationTime(seconds) to report the measured time, which will be used
+  // to control how many iterations are run, and in the printing of items/second
+  // or MB/second values.
+  Benchmark* UseManualTime();
+
+  // Set the asymptotic computational complexity for the benchmark. If called,
+  // the asymptotic computational complexity will be shown in the output.
+  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
+
+  // Set the asymptotic computational complexity for the benchmark. If called,
+  // the asymptotic computational complexity will be shown in the output.
+  Benchmark* Complexity(BigOFunc* complexity);
+
   // Support for running multiple copies of the same benchmark concurrently
   // in multiple threads.  This may be useful when measuring the scaling
   // of some piece of code.
@@ -491,11 +610,11 @@
     virtual void Run(State& st) {
       this->SetUp(st);
       this->BenchmarkCase(st);
-      this->TearDown();
+      this->TearDown(st);
     }
 
     virtual void SetUp(const State&) {}
-    virtual void TearDown() {}
+    virtual void TearDown(const State&) {}
 
 protected:
     virtual void BenchmarkCase(State&) = 0;
@@ -534,10 +653,33 @@
 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
 #define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
+#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
 #define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
 #define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
   BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
 
+#if __cplusplus >= 201103L
+
+// Register a benchmark which invokes the function specified by `func`
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <class ...ExtraArgs>
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//   [...]
+// }
+// /* Registers a benchmark named "BM_takes_args/int_string_test" */
+// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)                       \
+    BENCHMARK_PRIVATE_DECLARE(func) =                                      \
+        (::benchmark::internal::RegisterBenchmarkInternal(                 \
+            new ::benchmark::internal::FunctionBenchmark(                  \
+                    #func "/" #test_case_name,                             \
+                    [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+
+#endif // __cplusplus >= 201103L
+
 // This will register a benchmark for a templatized function.  For example:
 //
 // template<int arg>
diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h
index 3e9540e..09d13c1 100644
--- a/include/benchmark/macros.h
+++ b/include/benchmark/macros.h
@@ -28,15 +28,23 @@
 # define BENCHMARK_UNUSED __attribute__((unused))
 # define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
 # define BENCHMARK_NOEXCEPT noexcept
+# define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #elif defined(_MSC_VER) && !defined(__clang__)
 # define BENCHMARK_UNUSED
 # define BENCHMARK_ALWAYS_INLINE __forceinline
-# define BENCHMARK_NOEXCEPT
+# if _MSC_VER >= 1900
+#  define BENCHMARK_NOEXCEPT noexcept
+#  define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+# else
+#  define BENCHMARK_NOEXCEPT
+#  define BENCHMARK_NOEXCEPT_OP(x)
+# endif
 # define __func__ __FUNCTION__
 #else
 # define BENCHMARK_UNUSED
 # define BENCHMARK_ALWAYS_INLINE
 # define BENCHMARK_NOEXCEPT
+# define BENCHMARK_NOEXCEPT_OP(x)
 #endif
 
 #if defined(__GNUC__)
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
index d23ab65..22c97a0 100644
--- a/include/benchmark/reporter.h
+++ b/include/benchmark/reporter.h
@@ -14,11 +14,13 @@
 #ifndef BENCHMARK_REPORTER_H_
 #define BENCHMARK_REPORTER_H_
 
+#include <cassert>
+#include <iosfwd>
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "benchmark_api.h" // For forward declaration of BenchmarkReporter
+#include "benchmark_api.h"  // For forward declaration of BenchmarkReporter
 
 namespace benchmark {
 
@@ -40,27 +42,62 @@
 
   struct Run {
     Run() :
+      error_occurred(false),
       iterations(1),
+      time_unit(kNanosecond),
       real_accumulated_time(0),
       cpu_accumulated_time(0),
       bytes_per_second(0),
       items_per_second(0),
-      max_heapbytes_used(0) {}
+      max_heapbytes_used(0),
+      complexity(oNone),
+      complexity_n(0),
+      report_big_o(false),
+      report_rms(false) {}
 
     std::string benchmark_name;
     std::string report_label;  // Empty if not set by benchmark.
+    bool error_occurred;
+    std::string error_message;
+
     int64_t iterations;
+    TimeUnit time_unit;
     double real_accumulated_time;
     double cpu_accumulated_time;
 
+    // Return a value representing the real time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedRealTime() const;
+
+    // Return a value representing the cpu time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedCPUTime() const;
+
     // Zero if not set by benchmark.
     double bytes_per_second;
     double items_per_second;
 
     // This is set to 0.0 if memory tracing is not enabled.
     double max_heapbytes_used;
+
+    // Keep track of arguments to compute asymptotic complexity
+    BigO complexity;
+    BigOFunc* complexity_lambda;
+    int complexity_n;
+
+    // Inform print function whether the current run is a complexity report
+    bool report_big_o;
+    bool report_rms;
   };
 
+  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+  // and the error stream set to 'std::cerr'
+  BenchmarkReporter();
+
   // Called once for every suite of benchmarks run.
   // The parameter "context" contains information that the
   // reporter may wish to use when generating its report, for example the
@@ -70,18 +107,50 @@
   virtual bool ReportContext(const Context& context) = 0;
 
   // Called once for each group of benchmark runs, gives information about
-  // cpu-time and heap memory usage during the benchmark run.
-  // Note that all the grouped benchmark runs should refer to the same
-  // benchmark, thus have the same name.
+  // cpu-time and heap memory usage during the benchmark run. If the group
+  // of runs contained more than two entries, then 'report' contains additional
+  // elements representing the mean and standard deviation of those runs.
+  // Additionally, if this group of runs was the last in a family of benchmarks,
+  // 'report' contains additional entries representing the asymptotic
+  // complexity and RMS of that benchmark family.
   virtual void ReportRuns(const std::vector<Run>& report) = 0;
 
   // Called once and only once after every group of benchmarks is run and
   // reported.
-  virtual void Finalize();
+  virtual void Finalize() {}
+
+  // REQUIRES: The object referenced by 'out' is valid for the lifetime
+  // of the reporter.
+  void SetOutputStream(std::ostream* out) {
+    assert(out);
+    output_stream_ = out;
+  }
+
+  // REQUIRES: The object referenced by 'err' is valid for the lifetime
+  // of the reporter.
+  void SetErrorStream(std::ostream* err) {
+    assert(err);
+    error_stream_ = err;
+  }
+
+  std::ostream& GetOutputStream() const {
+    return *output_stream_;
+  }
+
+  std::ostream& GetErrorStream() const {
+    return *error_stream_;
+  }
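+
+  // A minimal usage sketch (assuming the `ConsoleReporter` declared below and
+  // that the caller includes <fstream>):
+  //
+  //   std::ofstream file("benchmark_output.txt");
+  //   benchmark::ConsoleReporter reporter;
+  //   reporter.SetOutputStream(&file);
+  //   benchmark::RunSpecifiedBenchmarks(&reporter);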
 
   virtual ~BenchmarkReporter();
-protected:
-    static void ComputeStats(std::vector<Run> const& reports, Run* mean, Run* stddev);
+
+  // Write a human readable string to 'out' representing the specified
+  // 'context'.
+  // REQUIRES: 'out' is non-null.
+  static void PrintBasicContext(std::ostream* out, Context const& context);
+
+ private:
+  std::ostream* output_stream_;
+  std::ostream* error_stream_;
 };
 
 // Simple reporter that outputs benchmark data to the console. This is the
@@ -90,33 +159,58 @@
  public:
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
-protected:
+
+ protected:
   virtual void PrintRunData(const Run& report);
 
   size_t name_field_width_;
 };
 
 class JSONReporter : public BenchmarkReporter {
-public:
+ public:
   JSONReporter() : first_report_(true) {}
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
   virtual void Finalize();
 
-private:
+ private:
   void PrintRunData(const Run& report);
 
   bool first_report_;
 };
 
 class CSVReporter : public BenchmarkReporter {
-public:
+ public:
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
 
-private:
+ private:
   void PrintRunData(const Run& report);
 };
 
-} // end namespace benchmark
-#endif // BENCHMARK_REPORTER_H_
+inline const char* GetTimeUnitString(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return "ms";
+    case kMicrosecond:
+      return "us";
+    case kNanosecond:
+    default:
+      return "ns";
+  }
+}
+
+inline double GetTimeUnitMultiplier(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return 1e3;
+    case kMicrosecond:
+      return 1e6;
+    case kNanosecond:
+    default:
+      return 1e9;
+  }
+}
+
+}  // end namespace benchmark
+#endif  // BENCHMARK_REPORTER_H_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 811d075..6dab64b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,7 +5,7 @@
 set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
                  "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
                  "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
-                 "sysinfo.cc" "walltime.cc")
+                 "sysinfo.cc" "walltime.cc" "complexity.cc")
 # Determine the correct regular expression engine to use
 if(HAVE_STD_REGEX)
   set(RE_FILES "re_std.cc")
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 08b180e..cb8e132 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -33,6 +33,7 @@
 
 #include "check.h"
 #include "commandlineflags.h"
+#include "complexity.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
@@ -64,9 +65,9 @@
              "The number of runs of each benchmark. If greater than 1, the "
              "mean and standard deviation of the runs will be reported.");
 
-DEFINE_string(benchmark_format, "tabular",
+DEFINE_string(benchmark_format, "console",
               "The format to use for console output. Valid values are "
-              "'tabular', 'json', or 'csv'.");
+              "'console', 'json', or 'csv'.");
 
 DEFINE_bool(color_print, true, "Enables colorized logging.");
 
@@ -112,13 +113,24 @@
     return &label;
 }
 
+// Global variable so that a benchmark can report an error as a human-readable
+// string. If error_message is null, no error occurred.
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+typedef char* error_message_type;
+#else
+typedef const char* error_message_type;
+#endif
+
+static std::atomic<error_message_type> error_message = ATOMIC_VAR_INIT(nullptr);
+
 // TODO(ericwf): support MallocCounter.
 //static benchmark::MallocCounter *benchmark_mc;
 
 struct ThreadStats {
-    ThreadStats() : bytes_processed(0), items_processed(0) {}
+    ThreadStats() : bytes_processed(0), items_processed(0), complexity_n(0) {}
     int64_t bytes_processed;
     int64_t items_processed;
+    int  complexity_n;
 };
 
 // Timer management class
@@ -126,13 +138,16 @@
  public:
   TimerManager(int num_threads, Notification* done)
       : num_threads_(num_threads),
+        running_threads_(num_threads),
         done_(done),
         running_(false),
         real_time_used_(0),
         cpu_time_used_(0),
+        manual_time_used_(0),
         num_finalized_(0),
         phase_number_(0),
-        entered_(0) {
+        entered_(0)
+  {
   }
 
   // Called by each thread
@@ -170,6 +185,21 @@
   }
 
   // Called by each thread
+  void SetIterationTime(double seconds) EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        manual_time_used_ += seconds;
+      }
+    }
+    if (last_thread) {
+      phase_condition_.notify_all();
+    }
+  }
+
+  // Called by each thread
   void Finalize() EXCLUDES(lock_) {
     MutexLock l(lock_);
     num_finalized_++;
@@ -180,6 +210,15 @@
     }
   }
 
+  void RemoveErroredThread() EXCLUDES(lock_) {
+    MutexLock ml(lock_);
+    int last_thread = --running_threads_ == 0;
+    if (last_thread && running_)
+      InternalStop();
+    else if (!last_thread)
+      phase_condition_.notify_all();
+  }
+
   // REQUIRES: timer is not running
   double real_time_used() EXCLUDES(lock_) {
     MutexLock l(lock_);
@@ -194,10 +233,18 @@
     return cpu_time_used_;
   }
 
+  // REQUIRES: timer is not running
+  double manual_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return manual_time_used_;
+  }
+
  private:
   Mutex lock_;
   Condition phase_condition_;
   int num_threads_;
+  int running_threads_;
   Notification* done_;
 
   bool running_;                // Is the timer running
@@ -207,6 +254,8 @@
   // Accumulated time so far (does not contain current slice if running_)
   double real_time_used_;
   double cpu_time_used_;
+  // Manually set iteration time. User sets this with SetIterationTime(seconds).
+  double manual_time_used_;
 
   // How many threads have called Finalize()
   int num_finalized_;
@@ -227,22 +276,24 @@
   // entered the barrier.  Returns iff this is the last thread to
   // enter the barrier.
   bool Barrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, num_threads_);
+    CHECK_LT(entered_, running_threads_);
     entered_++;
-    if (entered_ < num_threads_) {
+    if (entered_ < running_threads_) {
       // Wait for all threads to enter
       int phase_number_cp = phase_number_;
       auto cb = [this, phase_number_cp]() {
-        return this->phase_number_ > phase_number_cp;
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_; // A thread has aborted in error
       };
       phase_condition_.wait(ml.native_handle(), cb);
-      return false;  // I was not the last one
-    } else {
-      // Last thread has reached the barrier
-      phase_number_++;
-      entered_ = 0;
-      return true;
+      if (phase_number_ > phase_number_cp)
+        return false;
+      // else (running_threads_ == entered_) and we are the last thread.
     }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
   }
 };
 
@@ -261,7 +312,14 @@
   int            arg1;
   bool           has_arg2;
   int            arg2;
+  TimeUnit       time_unit;
+  int            range_multiplier;
   bool           use_real_time;
+  bool           use_manual_time;
+  BigO           complexity;
+  BigOFunc*      complexity_lambda;
+  bool           last_benchmark_instance;
+  int            repetitions;
   double         min_time;
   int            threads;    // Number of concurrent threads to use
   bool           multithreaded;  // Is benchmark multi-threaded?
@@ -294,12 +352,18 @@
   ~BenchmarkImp();
 
   void Arg(int x);
+  void Unit(TimeUnit unit);
   void Range(int start, int limit);
   void DenseRange(int start, int limit);
   void ArgPair(int start, int limit);
   void RangePair(int lo1, int hi1, int lo2, int hi2);
+  void RangeMultiplier(int multiplier);
   void MinTime(double n);
+  void Repetitions(int n);
   void UseRealTime();
+  void UseManualTime();
+  void Complexity(BigO complexity);
+  void ComplexityLambda(BigOFunc* complexity);
   void Threads(int t);
   void ThreadRange(int min_threads, int max_threads);
   void ThreadPerCpu();
@@ -313,8 +377,14 @@
   std::string name_;
   int arg_count_;
   std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
   double min_time_;
+  int repetitions_;
   bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
   std::vector<int> thread_counts_;
 
   BenchmarkImp& operator=(BenchmarkImp const&);
@@ -372,8 +442,14 @@
         instance.arg1 = args.first;
         instance.has_arg2 = family->arg_count_ == 2;
         instance.arg2 = args.second;
+        instance.time_unit = family->time_unit_;
+        instance.range_multiplier = family->range_multiplier_;
         instance.min_time = family->min_time_;
+        instance.repetitions = family->repetitions_;
         instance.use_real_time = family->use_real_time_;
+        instance.use_manual_time = family->use_manual_time_;
+        instance.complexity = family->complexity_;
+        instance.complexity_lambda = family->complexity_lambda_;
         instance.threads = num_threads;
         instance.multithreaded = !(family->thread_counts_.empty());
 
@@ -387,7 +463,12 @@
         if (!IsZero(family->min_time_)) {
           instance.name +=  StringPrintF("/min_time:%0.3f",  family->min_time_);
         }
-        if (family->use_real_time_) {
+        if (family->repetitions_ != 0) {
+          instance.name +=  StringPrintF("/repeats:%d",  family->repetitions_);
+        }
+        if (family->use_manual_time_) {
+          instance.name +=  "/manual_time";
+        } else if (family->use_real_time_) {
           instance.name +=  "/real_time";
         }
 
@@ -397,6 +478,7 @@
         }
 
         if (re.Match(instance.name)) {
+          instance.last_benchmark_instance = (args == family->args_.back());
           benchmarks->push_back(instance);
         }
       }
@@ -406,8 +488,10 @@
 }
 
 BenchmarkImp::BenchmarkImp(const char* name)
-    : name_(name), arg_count_(-1),
-      min_time_(0.0), use_real_time_(false) {
+    : name_(name), arg_count_(-1), time_unit_(kNanosecond),
+      range_multiplier_(kRangeMultiplier), min_time_(0.0), repetitions_(0),
+      use_real_time_(false), use_manual_time_(false),
+      complexity_(oNone) {
 }
 
 BenchmarkImp::~BenchmarkImp() {
@@ -419,11 +503,15 @@
   args_.emplace_back(x, -1);
 }
 
+void BenchmarkImp::Unit(TimeUnit unit) {
+  time_unit_ = unit;
+}
+
 void BenchmarkImp::Range(int start, int limit) {
   CHECK(arg_count_ == -1 || arg_count_ == 1);
   arg_count_ = 1;
   std::vector<int> arglist;
-  AddRange(&arglist, start, limit, kRangeMultiplier);
+  AddRange(&arglist, start, limit, range_multiplier_);
 
   for (int i : arglist) {
     args_.emplace_back(i, -1);
@@ -450,8 +538,8 @@
   CHECK(arg_count_ == -1 || arg_count_ == 2);
   arg_count_ = 2;
   std::vector<int> arglist1, arglist2;
-  AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
-  AddRange(&arglist2, lo2, hi2, kRangeMultiplier);
+  AddRange(&arglist1, lo1, hi1, range_multiplier_);
+  AddRange(&arglist2, lo2, hi2, range_multiplier_);
 
   for (int i : arglist1) {
     for (int j : arglist2) {
@@ -460,15 +548,40 @@
   }
 }
 
+void BenchmarkImp::RangeMultiplier(int multiplier) {
+  CHECK(multiplier > 1);
+  range_multiplier_ = multiplier;
+}
+
 void BenchmarkImp::MinTime(double t) {
   CHECK(t > 0.0);
   min_time_ = t;
 }
 
+
+void BenchmarkImp::Repetitions(int n) {
+  CHECK(n > 0);
+  repetitions_ = n;
+}
+
 void BenchmarkImp::UseRealTime() {
+  CHECK(!use_manual_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
   use_real_time_ = true;
 }
 
+void BenchmarkImp::UseManualTime() {
+  CHECK(!use_real_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  use_manual_time_ = true;
+}
+
+void BenchmarkImp::Complexity(BigO complexity) {
+  complexity_ = complexity;
+}
+
+void BenchmarkImp::ComplexityLambda(BigOFunc* complexity) {
+  complexity_lambda_ = complexity;
+}
+
 void BenchmarkImp::Threads(int t) {
   CHECK_GT(t, 0);
   thread_counts_.push_back(t);
@@ -493,6 +606,7 @@
 void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
   CHECK_GE(lo, 0);
   CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
 
   // Add "lo"
   dst->push_back(lo);
@@ -531,6 +645,11 @@
   return this;
 }
 
+Benchmark* Benchmark::Unit(TimeUnit unit) {
+  imp_->Unit(unit);
+  return this;
+}
+
 Benchmark* Benchmark::Range(int start, int limit) {
   imp_->Range(start, limit);
   return this;
@@ -556,6 +675,17 @@
   return this;
 }
 
+Benchmark* Benchmark::RangeMultiplier(int multiplier) {
+  imp_->RangeMultiplier(multiplier);
+  return this;
+}
+
+
+Benchmark* Benchmark::Repetitions(int t) {
+  imp_->Repetitions(t);
+  return this;
+}
+
 Benchmark* Benchmark::MinTime(double t) {
   imp_->MinTime(t);
   return this;
@@ -566,6 +696,22 @@
   return this;
 }
 
+Benchmark* Benchmark::UseManualTime() {
+  imp_->UseManualTime();
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigO complexity) {
+  imp_->Complexity(complexity);
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
+  imp_->Complexity(oLambda);
+  imp_->ComplexityLambda(complexity);
+  return this;
+}
+
 Benchmark* Benchmark::Threads(int t) {
   imp_->Threads(t);
   return this;
@@ -593,7 +739,6 @@
 
 namespace {
 
-
 // Execute one thread of benchmark b for the specified number of iterations.
 // Adds the stats collected for the thread into *total.
 void RunInThread(const benchmark::internal::Benchmark::Instance* b,
@@ -607,13 +752,16 @@
     MutexLock l(GetBenchmarkLock());
     total->bytes_processed += st.bytes_processed();
     total->items_processed += st.items_processed();
+    total->complexity_n += st.complexity_length_n();
   }
 
   timer_manager->Finalize();
 }
 
 void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
-                  BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
+                  BenchmarkReporter* br,
+                  std::vector<BenchmarkReporter::Run>& complexity_reports)
+  EXCLUDES(GetBenchmarkLock()) {
   size_t iters = 1;
 
   std::vector<BenchmarkReporter::Run> reports;
@@ -622,7 +770,9 @@
   if (b.multithreaded)
     pool.resize(b.threads);
 
-  for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
+  const int repeats = b.repetitions != 0 ? b.repetitions
+                                         : FLAGS_benchmark_repetitions;
+  for (int i = 0; i < repeats; i++) {
     std::string mem;
     for (;;) {
       // Try benchmark
@@ -632,6 +782,7 @@
         MutexLock l(GetBenchmarkLock());
         GetReportLabel()->clear();
       }
+      error_message = nullptr;
 
       Notification done;
       timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
@@ -647,7 +798,7 @@
             thread.join();
         }
         for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-            pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
+            pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti), &total);
         }
       } else {
         // Run directly in this thread
@@ -658,6 +809,7 @@
 
       const double cpu_accumulated_time = timer_manager->cpu_time_used();
       const double real_accumulated_time = timer_manager->real_time_used();
+      const double manual_accumulated_time = timer_manager->manual_time_used();
       timer_manager.reset();
 
       VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
@@ -665,7 +817,9 @@
 
       // Base decisions off of real time if requested by this benchmark.
       double seconds = cpu_accumulated_time;
-      if (b.use_real_time) {
+      if (b.use_manual_time) {
+          seconds = manual_accumulated_time;
+      } else if (b.use_real_time) {
           seconds = real_accumulated_time;
       }
 
@@ -674,35 +828,53 @@
         MutexLock l(GetBenchmarkLock());
         label = *GetReportLabel();
       }
+      error_message_type error_msg = error_message;
 
       const double min_time = !IsZero(b.min_time) ? b.min_time
                                                   : FLAGS_benchmark_min_time;
 
       // If this was the first run, was elapsed time or cpu time large enough?
       // If this is not the first run, go with the current value of iter.
-      if ((i > 0) ||
+      if ((i > 0) || (error_msg != nullptr) ||
           (iters >= kMaxIterations) ||
           (seconds >= min_time) ||
           (real_accumulated_time >= 5*min_time)) {
-        double bytes_per_second = 0;
-        if (total.bytes_processed > 0 && seconds > 0.0) {
-          bytes_per_second = (total.bytes_processed / seconds);
-        }
-        double items_per_second = 0;
-        if (total.items_processed > 0 && seconds > 0.0) {
-          items_per_second = (total.items_processed / seconds);
-        }
 
         // Create report about this benchmark run.
         BenchmarkReporter::Run report;
         report.benchmark_name = b.name;
+        report.error_occurred = error_msg != nullptr;
+        report.error_message = error_msg != nullptr ? error_msg : "";
         report.report_label = label;
         // Report the total iterations across all threads.
         report.iterations = static_cast<int64_t>(iters) * b.threads;
-        report.real_accumulated_time = real_accumulated_time;
-        report.cpu_accumulated_time = cpu_accumulated_time;
-        report.bytes_per_second = bytes_per_second;
-        report.items_per_second = items_per_second;
+        report.time_unit = b.time_unit;
+
+        if (!report.error_occurred) {
+          double bytes_per_second = 0;
+          if (total.bytes_processed > 0 && seconds > 0.0) {
+            bytes_per_second = (total.bytes_processed / seconds);
+          }
+          double items_per_second = 0;
+          if (total.items_processed > 0 && seconds > 0.0) {
+            items_per_second = (total.items_processed / seconds);
+          }
+
+          if (b.use_manual_time) {
+            report.real_accumulated_time = manual_accumulated_time;
+          } else {
+            report.real_accumulated_time = real_accumulated_time;
+          }
+          report.cpu_accumulated_time = cpu_accumulated_time;
+          report.bytes_per_second = bytes_per_second;
+          report.items_per_second = items_per_second;
+          report.complexity_n = total.complexity_n;
+          report.complexity = b.complexity;
+          report.complexity_lambda = b.complexity_lambda;
+          if(report.complexity != oNone)
+            complexity_reports.push_back(report);
+        }
+
         reports.push_back(report);
         break;
       }
@@ -726,7 +898,19 @@
       iters = static_cast<int>(next_iters + 0.5);
     }
   }
+  std::vector<BenchmarkReporter::Run> additional_run_stats = ComputeStats(reports);
+  reports.insert(reports.end(), additional_run_stats.begin(),
+                 additional_run_stats.end());
+
+  if((b.complexity != oNone) && b.last_benchmark_instance) {
+    additional_run_stats = ComputeBigO(complexity_reports);
+    reports.insert(reports.end(), additional_run_stats.begin(),
+                   additional_run_stats.end());
+    complexity_reports.clear();
+  }
+
   br->ReportRuns(reports);
+
   if (b.multithreaded) {
     for (std::thread& thread : pool)
       thread.join();
@@ -737,10 +921,12 @@
 
 State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
              int thread_i, int n_threads)
-    : started_(false), total_iterations_(0),
+    : started_(false), finished_(false), total_iterations_(0),
       has_range_x_(has_x), range_x_(x),
       has_range_y_(has_y), range_y_(y),
       bytes_processed_(0), items_processed_(0),
+      complexity_n_(0),
+      error_occurred_(false),
       thread_index(thread_i),
       threads(n_threads),
       max_iterations(max_iters)
@@ -752,14 +938,33 @@
 void State::PauseTiming() {
   // Add in time accumulated so far
   CHECK(running_benchmark);
+  CHECK(started_ && !finished_ && !error_occurred_);
   timer_manager->StopTimer();
 }
 
 void State::ResumeTiming() {
   CHECK(running_benchmark);
+  CHECK(started_ && !finished_ && !error_occurred_);
   timer_manager->StartTimer();
 }
 
+void State::SkipWithError(const char* msg) {
+  CHECK(msg);
+  error_occurred_ = true;
+  error_message_type expected_no_error_msg = nullptr;
+  error_message.compare_exchange_weak(expected_no_error_msg,
+    const_cast<error_message_type>(msg));
+  started_ = finished_ = true;
+  total_iterations_ = max_iterations;
+  timer_manager->RemoveErroredThread();
+}
+
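
A minimal sketch (not part of the patch) of how a benchmark body is expected to use the new SkipWithError(); the failed-setup condition is hypothetical:

  #include "benchmark/benchmark.h"

  static void BM_GuardedWork(benchmark::State& state) {
    const bool setup_ok = false;  // stand-in for a real setup step
    if (!setup_ok) {
      state.SkipWithError("setup failed, skipping run");
    }
    while (state.KeepRunning()) {
      // Measured work; never entered once the run was skipped, because
      // SkipWithError() fast-forwards total_iterations_ to max_iterations.
    }
  }
  BENCHMARK(BM_GuardedWork);
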
+void State::SetIterationTime(double seconds)
+{
+  CHECK(running_benchmark);
+  timer_manager->SetIterationTime(seconds);
+}
+
 void State::SetLabel(const char* label) {
   CHECK(running_benchmark);
   MutexLock l(GetBenchmarkLock());
@@ -769,32 +974,19 @@
 namespace internal {
 namespace {
 
-void PrintBenchmarkList() {
-  std::vector<Benchmark::Instance> benchmarks;
-  auto families = BenchmarkFamilies::GetInstance();
-  if (!families->FindBenchmarks(".", &benchmarks)) return;
-
-  for (const internal::Benchmark::Instance& benchmark : benchmarks) {
-    std::cout <<  benchmark.name << "\n";
-  }
-}
-
-void RunMatchingBenchmarks(const std::string& spec,
+void RunMatchingBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
                            BenchmarkReporter* reporter) {
   CHECK(reporter != nullptr);
-  if (spec.empty()) return;
-
-  std::vector<Benchmark::Instance> benchmarks;
-  auto families = BenchmarkFamilies::GetInstance();
-  if (!families->FindBenchmarks(spec, &benchmarks)) return;
 
   // Determine the width of the name field using a minimum width of 10.
+  bool has_repetitions = FLAGS_benchmark_repetitions > 1;
   size_t name_field_width = 10;
   for (const Benchmark::Instance& benchmark : benchmarks) {
     name_field_width =
         std::max<size_t>(name_field_width, benchmark.name.size());
+    has_repetitions |= benchmark.repetitions > 1;
   }
-  if (FLAGS_benchmark_repetitions > 1)
+  if (has_repetitions)
     name_field_width += std::strlen("_stddev");
 
   // Print header here
@@ -805,16 +997,19 @@
   context.cpu_scaling_enabled = CpuScalingEnabled();
   context.name_field_width = name_field_width;
 
+  // Keep track of running times of all instances of the current benchmark
+  std::vector<BenchmarkReporter::Run> complexity_reports;
+
   if (reporter->ReportContext(context)) {
     for (const auto& benchmark : benchmarks) {
-      RunBenchmark(benchmark, reporter);
+      RunBenchmark(benchmark, reporter, complexity_reports);
     }
   }
 }
 
 std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
-  if (FLAGS_benchmark_format == "tabular") {
+  if (FLAGS_benchmark_format == "console") {
     return PtrType(new ConsoleReporter);
   } else if (FLAGS_benchmark_format == "json") {
     return PtrType(new JSONReporter);
@@ -829,26 +1024,32 @@
 } // end namespace
 } // end namespace internal
 
-void RunSpecifiedBenchmarks() {
-  RunSpecifiedBenchmarks(nullptr);
+size_t RunSpecifiedBenchmarks() {
+  return RunSpecifiedBenchmarks(nullptr);
 }
 
-void RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
-  if (FLAGS_benchmark_list_tests) {
-    internal::PrintBenchmarkList();
-    return;
-  }
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
 
-  std::unique_ptr<BenchmarkReporter> default_reporter;
-  if (!reporter) {
-    default_reporter = internal::GetDefaultReporter();
-    reporter = default_reporter.get();
+  std::vector<internal::Benchmark::Instance> benchmarks;
+  auto families = internal::BenchmarkFamilies::GetInstance();
+  if (!families->FindBenchmarks(spec, &benchmarks)) return 0;
+
+  if (FLAGS_benchmark_list_tests) {
+    for (auto const& benchmark : benchmarks)
+      std::cout <<  benchmark.name << "\n";
+  } else {
+    std::unique_ptr<BenchmarkReporter> default_reporter;
+    if (!reporter) {
+      default_reporter = internal::GetDefaultReporter();
+      reporter = default_reporter.get();
+    }
+    internal::RunMatchingBenchmarks(benchmarks, reporter);
+    reporter->Finalize();
   }
-  internal::RunMatchingBenchmarks(spec, reporter);
-  reporter->Finalize();
+  return benchmarks.size();
 }
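
One way a caller might use the new size_t return value (a sketch, not part of the patch):

  #include "benchmark/benchmark.h"

  int main(int argc, char** argv) {
    benchmark::Initialize(&argc, argv);
    // RunSpecifiedBenchmarks() now reports how many benchmarks matched the filter;
    // treating "no matches" as an error is this sketch's choice, not the library's.
    if (benchmark::RunSpecifiedBenchmarks() == 0) return 1;
    return 0;
  }
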
 
 namespace internal {
@@ -860,7 +1061,7 @@
           "          [--benchmark_filter=<regex>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_format=<tabular|json|csv>]\n"
+          "          [--benchmark_format=<console|json|csv>]\n"
           "          [--color_print={true|false}]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
@@ -891,7 +1092,8 @@
       PrintUsageAndExit();
     }
   }
-  if (FLAGS_benchmark_format != "tabular" &&
+
+  if (FLAGS_benchmark_format != "console" &&
       FLAGS_benchmark_format != "json" &&
       FLAGS_benchmark_format != "csv") {
     PrintUsageAndExit();
diff --git a/src/check.h b/src/check.h
index d2c1fda..4572bab 100644
--- a/src/check.h
+++ b/src/check.h
@@ -10,6 +10,18 @@
 namespace benchmark {
 namespace internal {
 
+typedef void(AbortHandlerT)();
+
+inline AbortHandlerT*& GetAbortHandler() {
+    static AbortHandlerT* handler = &std::abort;
+    return handler;
+}
+
+BENCHMARK_NORETURN inline void CallAbortHandler() {
+    GetAbortHandler()();
+    std::abort(); // fallback to enforce noreturn
+}
+
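
A sketch of how the new hook can be used to make CHECK failures testable (mirroring how the diagnostics_test registered in test/CMakeLists.txt below exercises it); InstallThrowingHandler is a hypothetical helper:

  #include <stdexcept>
  #include "check.h"  // internal header, included relative to src/

  void InstallThrowingHandler() {
    // ~CheckHandler is declared noexcept(false) below precisely so a handler may throw.
    benchmark::internal::GetAbortHandler() =
        [] { throw std::logic_error("CHECK condition failed"); };
  }
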
 // CheckHandler is the class constructed by failing CHECK macros. CheckHandler
 // will log information about the failures and abort when it is destructed.
 class CheckHandler {
@@ -25,13 +37,13 @@
     return log_;
   }
 
-  BENCHMARK_NORETURN ~CheckHandler() {
+  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
       log_ << std::endl;
-      std::abort();
+      CallAbortHandler();
   }
 
-  CheckHandler & operator=(const CheckHandler&) = delete;

-  CheckHandler(const CheckHandler&) = delete;

+  CheckHandler & operator=(const CheckHandler&) = delete;
+  CheckHandler(const CheckHandler&) = delete;
   CheckHandler() = delete;
 private:
   std::ostream& log_;
diff --git a/src/colorprint.cc b/src/colorprint.cc
index 81f917b..efb8626 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -16,8 +16,12 @@
 
 #include <cstdarg>
 #include <cstdio>
+#include <string>
+#include <memory>
 
 #include "commandlineflags.h"
+#include "check.h"
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
@@ -74,14 +78,51 @@
   };
 #endif
 }
+
 }  // end namespace
 
-void ColorPrintf(LogColor color, const char* fmt, ...) {
+std::string FormatString(const char *msg, va_list args) {
+  // We might need a second shot at this, so pre-emptively make a copy.
+  va_list args_cp;
+  va_copy(args_cp, args);
+
+  std::size_t size = 256;
+  char local_buff[256];
+  auto ret = std::vsnprintf(local_buff, size, msg, args_cp);
+
+  va_end(args_cp);
+
+  // Currently there is no error handling for failure, so this is a hack.
+  CHECK(ret >= 0);
+
+  if (ret == 0) // handle empty expansion
+    return {};
+  else if (static_cast<size_t>(ret) < size)
+    return local_buff;
+  else {
+    // we did not provide a long enough buffer on our first attempt.
+    size = (size_t)ret + 1; // + 1 for the null byte
+    std::unique_ptr<char[]> buff(new char[size]);
+    ret = std::vsnprintf(buff.get(), size, msg, args);
+    CHECK(ret > 0 && ((size_t)ret) < size);
+    return buff.get();
+  }
+}
+
+std::string FormatString(const char *msg, ...) {
+  va_list args;
+  va_start(args, msg);
+  auto tmp = FormatString(msg, args);
+  va_end(args);
+  return tmp;
+}
+
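
For illustration only (not part of the patch), FormatString() follows the usual printf contract; the width and column labels here are hypothetical:

  std::string header =
      benchmark::FormatString("%-*s %13s %13s\n", 20, "Benchmark", "Time", "CPU");
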
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
 
   if (!FLAGS_color_print) {
-    vprintf(fmt, args);
+    out << FormatString(fmt, args);
     va_end(args);
     return;
   }
@@ -107,10 +148,11 @@
   SetConsoleTextAttribute(stdout_handle, old_color_attrs);
 #else
   const char* color_code = GetPlatformColorCode(color);
-  if (color_code) fprintf(stdout, "\033[0;3%sm", color_code);
-  vprintf(fmt, args);
-  printf("\033[m");  // Resets the terminal to default.
+  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  out << FormatString(fmt, args) << "\033[m";
 #endif
+
   va_end(args);
 }
+
 }  // end namespace benchmark
diff --git a/src/colorprint.h b/src/colorprint.h
index 54d1f66..2b3c082 100644
--- a/src/colorprint.h
+++ b/src/colorprint.h
@@ -1,6 +1,10 @@
 #ifndef BENCHMARK_COLORPRINT_H_
 #define BENCHMARK_COLORPRINT_H_
 
+#include <cstdarg>
+#include <string>
+#include <iostream>
+
 namespace benchmark {
 enum LogColor {
   COLOR_DEFAULT,
@@ -13,7 +17,11 @@
   COLOR_WHITE
 };
 
-void ColorPrintf(LogColor color, const char* fmt, ...);
+std::string FormatString(const char* msg, va_list args);
+std::string FormatString(const char* msg, ...);
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COLORPRINT_H_
diff --git a/src/complexity.cc b/src/complexity.cc
new file mode 100644
index 0000000..b42bd38
--- /dev/null
+++ b/src/complexity.cc
@@ -0,0 +1,283 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#include "benchmark/benchmark_api.h"
+
+#include <algorithm>
+#include <cmath>
+#include "check.h"
+#include "complexity.h"
+#include "stat.h"
+
+namespace benchmark {
+
+// Internal function to calculate the different scalability forms
+BigOFunc* FittingCurve(BigO complexity) {
+  switch (complexity) {
+    case oN:
+      return [](int n) -> double { return n; };
+    case oNSquared:
+      return [](int n) -> double { return n * n; };
+    case oNCubed:
+      return [](int n) -> double { return n * n * n; };
+    case oLogN:
+      return [](int n) { return std::log2(n); };
+    case oNLogN:
+      return [](int n) { return n * std::log2(n); };
+    case o1:
+    default:
+      return [](int) { return 1.0; };
+  }
+}
+
+// Function to return a string for the calculated complexity
+std::string GetBigOString(BigO complexity) {
+  switch (complexity) {
+    case oN:
+      return "N";
+    case oNSquared:
+      return "N^2";
+    case oNCubed:
+      return "N^3";
+    case oLogN:
+      return "lgN";
+    case oNLogN:
+      return "NlgN";
+    case o1:
+      return "(1)";
+    default:
+      return "f(N)";
+  }
+}
+
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error, for the fitting curve
+// given by the lambda expression.
+//   - n             : Vector containing the size of the benchmark tests.
+//   - time          : Vector containing the times for the benchmark tests.
+//   - fitting_curve : lambda expression (e.g. [](int n) {return n; };).
+
+// For a deeper explanation of the algorithm logic, see the README file at
+// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+
+LeastSq MinimalLeastSq(const std::vector<int>& n,
+                       const std::vector<double>& time,
+                       BigOFunc* fitting_curve) {
+  double sigma_gn = 0.0;
+  double sigma_gn_squared = 0.0;
+  double sigma_time = 0.0;
+  double sigma_time_gn = 0.0;
+
+  // Calculate least square fitting parameter
+  for (size_t i = 0; i < n.size(); ++i) {
+    double gn_i = fitting_curve(n[i]);
+    sigma_gn += gn_i;
+    sigma_gn_squared += gn_i * gn_i;
+    sigma_time += time[i];
+    sigma_time_gn += time[i] * gn_i;
+  }
+
+  LeastSq result;
+  result.complexity = oLambda;
+
+  // Calculate complexity.
+  result.coef = sigma_time_gn / sigma_gn_squared;
+
+  // Calculate RMS
+  double rms = 0.0;
+  for (size_t i = 0; i < n.size(); ++i) {
+    double fit = result.coef * fitting_curve(n[i]);
+    rms += pow((time[i] - fit), 2);
+  }
+
+  // Normalize the RMS by the mean of the observed values
+  double mean = sigma_time / n.size();
+  result.rms = sqrt(rms / n.size()) / mean;
+
+  return result;
+}
+
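
As a reading aid (not part of the patch): for measured times t_i at sizes n_i and fitting curve g, the code above computes the closed-form single-parameter least-squares fit

  \mathrm{coef} = \frac{\sum_i t_i \, g(n_i)}{\sum_i g(n_i)^2}, \qquad
  \mathrm{rms} = \frac{1}{\bar t} \sqrt{\frac{1}{N} \sum_i \bigl(t_i - \mathrm{coef} \cdot g(n_i)\bigr)^2}, \qquad
  \bar t = \frac{1}{N} \sum_i t_i .
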
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error.
+//   - n          : Vector containing the size of the benchmark tests.
+//   - time       : Vector containing the times for the benchmark tests.
+//   - complexity : If different from oAuto, the fitting curve will stick to
+//                  this one. If it is oAuto, the best fitting curve will be
+//                  calculated.
+LeastSq MinimalLeastSq(const std::vector<int>& n,
+                       const std::vector<double>& time,
+                       const BigO complexity) {
+  CHECK_EQ(n.size(), time.size());
+  CHECK_GE(n.size(), 2);  // Do not compute the fitting curve if fewer than
+                          // two benchmark runs are given
+  CHECK_NE(complexity, oNone);
+
+  LeastSq best_fit;
+
+  if (complexity == oAuto) {
+    std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed};
+
+    // Take o1 as default best fitting curve
+    best_fit = MinimalLeastSq(n, time, FittingCurve(o1));
+    best_fit.complexity = o1;
+
+    // Compute all possible fitting curves and stick to the best one
+    for (const auto& fit : fit_curves) {
+      LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit));
+      if (current_fit.rms < best_fit.rms) {
+        best_fit = current_fit;
+        best_fit.complexity = fit;
+      }
+    }
+  } else {
+    best_fit = MinimalLeastSq(n, time, FittingCurve(complexity));
+    best_fit.complexity = complexity;
+  }
+
+  return best_fit;
+}
+
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  auto error_count =
+      std::count_if(reports.begin(), reports.end(),
+                    [](Run const& run) { return run.error_occurred; });
+
+  if (reports.size() - error_count < 2) {
+    // We don't report aggregated data if there were fewer than two non-errored runs.
+    return results;
+  }
+  // Accumulators.
+  Stat1_d real_accumulated_time_stat;
+  Stat1_d cpu_accumulated_time_stat;
+  Stat1_d bytes_per_second_stat;
+  Stat1_d items_per_second_stat;
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  int64_t const run_iterations = reports.front().iterations;
+
+  // Populate the accumulators.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    CHECK_EQ(run_iterations, run.iterations);
+    if (run.error_occurred) continue;
+    real_accumulated_time_stat +=
+        Stat1_d(run.real_accumulated_time / run.iterations, run.iterations);
+    cpu_accumulated_time_stat +=
+        Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
+    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
+    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
+  }
+
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
+  Run mean_data;
+  mean_data.benchmark_name = reports[0].benchmark_name + "_mean";
+  mean_data.iterations = run_iterations;
+  mean_data.real_accumulated_time =
+      real_accumulated_time_stat.Mean() * run_iterations;
+  mean_data.cpu_accumulated_time =
+      cpu_accumulated_time_stat.Mean() * run_iterations;
+  mean_data.bytes_per_second = bytes_per_second_stat.Mean();
+  mean_data.items_per_second = items_per_second_stat.Mean();
+
+  // Only add the label to mean/stddev if it is the same for all runs
+  mean_data.report_label = reports[0].report_label;
+  for (std::size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != reports[0].report_label) {
+      mean_data.report_label = "";
+      break;
+    }
+  }
+
+  Run stddev_data;
+  stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev";
+  stddev_data.report_label = mean_data.report_label;
+  stddev_data.iterations = 0;
+  stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev();
+  stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
+  stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
+  stddev_data.items_per_second = items_per_second_stat.StdDev();
+
+  results.push_back(mean_data);
+  results.push_back(stddev_data);
+  return results;
+}
+
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  if (reports.size() < 2) return results;
+
+  // Accumulators.
+  std::vector<int> n;
+  std::vector<double> real_time;
+  std::vector<double> cpu_time;
+
+  // Populate the accumulators.
+  for (const Run& run : reports) {
+    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    n.push_back(run.complexity_n);
+    real_time.push_back(run.real_accumulated_time / run.iterations);
+    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+  }
+
+  LeastSq result_cpu;
+  LeastSq result_real;
+
+  if (reports[0].complexity == oLambda) {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
+    result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
+  } else {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
+    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+  }
+  std::string benchmark_name =
+      reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
+  Run big_o;
+  big_o.benchmark_name = benchmark_name + "_BigO";
+  big_o.iterations = 0;
+  big_o.real_accumulated_time = result_real.coef;
+  big_o.cpu_accumulated_time = result_cpu.coef;
+  big_o.report_big_o = true;
+  big_o.complexity = result_cpu.complexity;
+
+  double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
+
+  // Carry the label of the measured runs over to the BigO and RMS reports
+  Run rms;
+  big_o.report_label = reports[0].report_label;
+  rms.benchmark_name = benchmark_name + "_RMS";
+  rms.report_label = big_o.report_label;
+  rms.iterations = 0;
+  rms.real_accumulated_time = result_real.rms / multiplier;
+  rms.cpu_accumulated_time = result_cpu.rms / multiplier;
+  rms.report_rms = true;
+  rms.complexity = result_cpu.complexity;
+
+  results.push_back(big_o);
+  results.push_back(rms);
+  return results;
+}
+
+}  // end namespace benchmark
diff --git a/src/complexity.h b/src/complexity.h
new file mode 100644
index 0000000..85cc125
--- /dev/null
+++ b/src/complexity.h
@@ -0,0 +1,64 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#ifndef COMPLEXITY_H_
+#define COMPLEXITY_H_
+
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark_api.h"
+#include "benchmark/reporter.h"
+
+namespace benchmark {
+
+// Return a vector containing the mean and standard deviation information for
+// the specified list of reports. If 'reports' contains fewer than two
+// non-errored runs, an empty vector is returned.
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+// Return a vector containing the bigO and RMS information for the specified
+// list of reports. If 'reports.size() < 2' an empty vector is returned.
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+// This data structure will contain the result returned by MinimalLeastSq
+//   - coef        : Estimated coefficient for the high-order term as
+//                   interpolated from data.
+//   - rms         : Normalized Root Mean Squared Error.
+//   - complexity  : Scalability form (e.g. oN, oNLogN). In case a scalability
+//                   form has been provided to MinimalLeastSq this will return
+//                   the same value. In case BigO::oAuto has been selected, this
+//                   parameter will return the best fitting curve detected.
+
+struct LeastSq {
+  LeastSq() :
+    coef(0.0),
+    rms(0.0),
+    complexity(oNone) {}
+
+  double coef;
+  double rms;
+  BigO complexity;
+};
+
+// Function to return a string for the calculated complexity
+std::string GetBigOString(BigO complexity);
+
+} // end namespace benchmark
+#endif // COMPLEXITY_H_
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 092936d..080c324 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -13,72 +13,66 @@
 // limitations under the License.
 
 #include "benchmark/reporter.h"
+#include "complexity.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <cstdio>
 #include <iostream>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "check.h"
 #include "colorprint.h"
+#include "commandlineflags.h"
+#include "internal_macros.h"
 #include "string_util.h"
 #include "walltime.h"
 
+DECLARE_bool(color_print);
+
 namespace benchmark {
 
 bool ConsoleReporter::ReportContext(const Context& context) {
   name_field_width_ = context.name_field_width;
 
-  std::cerr << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
-            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+  PrintBasicContext(&GetErrorStream(), context);
 
-  std::cerr << LocalDateTimeString() << "\n";
-
-  if (context.cpu_scaling_enabled) {
-    std::cerr << "***WARNING*** CPU scaling is enabled, the benchmark "
-                 "real time measurements may be noisy and will incur extra "
-                 "overhead.\n";
+#ifdef BENCHMARK_OS_WINDOWS
+  if (FLAGS_color_print && &std::cout != &GetOutputStream()) {
+      GetErrorStream() << "Color printing is only supported for stdout on windows."
+                          " Disabling color printing\n";
+      FLAGS_color_print = false;
   }
-
-#ifndef NDEBUG
-  std::cerr << "***WARNING*** Library was built as DEBUG. Timings may be "
-               "affected.\n";
 #endif
-
-  int output_width = fprintf(stdout, "%-*s %10s %10s %10s\n",
+  std::string str = FormatString("%-*s %13s %13s %10s\n",
                              static_cast<int>(name_field_width_), "Benchmark",
-                             "Time(ns)", "CPU(ns)", "Iterations");
-  std::cout << std::string(output_width - 1, '-') << "\n";
+                             "Time", "CPU", "Iterations");
+  GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
 
   return true;
 }
 
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
-  if (reports.empty()) {
-    return;
-  }
-
-  for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+  for (const auto& run : reports)
     PrintRunData(run);
-  }
-
-  if (reports.size() < 2) {
-    // We don't report aggregated data if there was a single run.
-    return;
-  }
-
-  Run mean_data;
-  Run stddev_data;
-  BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
-
-  // Output using PrintRun.
-  PrintRunData(mean_data);
-  PrintRunData(stddev_data);
 }
 
 void ConsoleReporter::PrintRunData(const Run& result) {
+  auto& Out = GetOutputStream();
+
+  auto name_color =
+      (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
+  ColorPrintf(Out, name_color, "%-*s ", name_field_width_,
+              result.benchmark_name.c_str());
+
+  if (result.error_occurred) {
+    ColorPrintf(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
+                result.error_message.c_str());
+    ColorPrintf(Out, COLOR_DEFAULT, "\n");
+    return;
+  }
   // Format bytes per second
   std::string rate;
   if (result.bytes_per_second > 0) {
@@ -90,27 +84,41 @@
   if (result.items_per_second > 0) {
     items = StrCat(" ", HumanReadableNumber(result.items_per_second),
                    " items/s");
+ }
+
+  const double real_time = result.GetAdjustedRealTime();
+  const double cpu_time = result.GetAdjustedCPUTime();
+
+  if (result.report_big_o) {
+    std::string big_o = GetBigOString(result.complexity);
+    ColorPrintf(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time,
+                big_o.c_str(), cpu_time, big_o.c_str());
+  } else if (result.report_rms) {
+    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
+                cpu_time * 100);
+  } else {
+    const char* timeLabel = GetTimeUnitString(result.time_unit);
+    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
+                cpu_time, timeLabel);
   }
 
-  double const multiplier = 1e9; // nano second multiplier
-  ColorPrintf(COLOR_GREEN, "%-*s ",
-              name_field_width_, result.benchmark_name.c_str());
-  if (result.iterations == 0) {
-    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
-                result.real_accumulated_time * multiplier,
-                result.cpu_accumulated_time * multiplier);
-  } else {
-    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
-                (result.real_accumulated_time * multiplier) /
-                    (static_cast<double>(result.iterations)),
-                (result.cpu_accumulated_time * multiplier) /
-                    (static_cast<double>(result.iterations)));
+  if (!result.report_big_o && !result.report_rms) {
+    ColorPrintf(Out, COLOR_CYAN, "%10lld", result.iterations);
   }
-  ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
-  ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
-              13, rate.c_str(),
-              18, items.c_str(),
-              result.report_label.c_str());
+
+  if (!rate.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
+  }
+
+  if (!items.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
+  }
+
+  if (!result.report_label.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
+  }
+
+  ColorPrintf(Out, COLOR_DEFAULT, "\n");
 }
 
 }  // end namespace benchmark
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index d78a9df..7bc7ef3 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -13,10 +13,13 @@
 // limitations under the License.
 
 #include "benchmark/reporter.h"
+#include "complexity.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <iostream>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "string_util.h"
@@ -26,80 +29,90 @@
 
 namespace benchmark {
 
+namespace {
+std::vector<std::string> elements = {
+  "name",
+  "iterations",
+  "real_time",
+  "cpu_time",
+  "time_unit",
+  "bytes_per_second",
+  "items_per_second",
+  "label",
+  "error_occurred",
+  "error_message"
+};
+}
+
 bool CSVReporter::ReportContext(const Context& context) {
-  std::cerr << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
-            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+  PrintBasicContext(&GetErrorStream(), context);
 
-  std::cerr << LocalDateTimeString() << "\n";
-
-  if (context.cpu_scaling_enabled) {
-    std::cerr << "***WARNING*** CPU scaling is enabled, the benchmark "
-                 "real time measurements may be noisy and will incur extra "
-                 "overhead.\n";
+  std::ostream& Out = GetOutputStream();
+  for (auto B = elements.begin(); B != elements.end(); ) {
+    Out << *B++;
+    if (B != elements.end())
+      Out << ",";
   }
-
-#ifndef NDEBUG
-  std::cerr << "***WARNING*** Library was built as DEBUG. Timings may be "
-               "affected.\n";
-#endif
-  std::cout << "name,iterations,real_time,cpu_time,bytes_per_second,"
-               "items_per_second,label\n";
+  Out << "\n";
   return true;
 }
 
-void CSVReporter::ReportRuns(std::vector<Run> const& reports) {
-  if (reports.empty()) {
-    return;
-  }
-
-  std::vector<Run> reports_cp = reports;
-  if (reports.size() >= 2) {
-    Run mean_data;
-    Run stddev_data;
-    BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
-    reports_cp.push_back(mean_data);
-    reports_cp.push_back(stddev_data);
-  }
-  for (auto it = reports_cp.begin(); it != reports_cp.end(); ++it) {
-    PrintRunData(*it);
-  }
+void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
+  for (const auto& run : reports)
+    PrintRunData(run);
 }
 
-void CSVReporter::PrintRunData(Run const& run) {
-  double const multiplier = 1e9;  // nano second multiplier
-  double cpu_time = run.cpu_accumulated_time * multiplier;
-  double real_time = run.real_accumulated_time * multiplier;
-  if (run.iterations != 0) {
-    real_time = real_time / static_cast<double>(run.iterations);
-    cpu_time = cpu_time / static_cast<double>(run.iterations);
-  }
+void CSVReporter::PrintRunData(const Run & run) {
+  std::ostream& Out = GetOutputStream();
 
   // Field with embedded double-quote characters must be doubled and the field
   // delimited with double-quotes.
   std::string name = run.benchmark_name;
   ReplaceAll(&name, "\"", "\"\"");
-  std::cout << "\"" << name << "\",";
+  Out << '"' << name << "\",";
+  if (run.error_occurred) {
+    Out << std::string(elements.size() - 3, ',');
+    Out << "true,";
+    std::string msg = run.error_message;
+    ReplaceAll(&msg, "\"", "\"\"");
+    Out << '"' << msg << "\"\n";
+    return;
+  }
 
-  std::cout << run.iterations << ",";
-  std::cout << real_time << ",";
-  std::cout << cpu_time << ",";
+  // Do not print iterations for BigO and RMS reports
+  if (!run.report_big_o && !run.report_rms) {
+    Out << run.iterations;
+  }
+  Out << ",";
+
+  Out << run.GetAdjustedRealTime() << ",";
+  Out << run.GetAdjustedCPUTime() << ",";
+
+  // Do not print the time unit for BigO and RMS reports
+  if (run.report_big_o) {
+    Out << GetBigOString(run.complexity);
+  } else if (!run.report_rms) {
+    Out << GetTimeUnitString(run.time_unit);
+  }
+  Out << ",";
 
   if (run.bytes_per_second > 0.0) {
-    std::cout << run.bytes_per_second;
+    Out << run.bytes_per_second;
   }
-  std::cout << ",";
+  Out << ",";
   if (run.items_per_second > 0.0) {
-    std::cout << run.items_per_second;
+    Out << run.items_per_second;
   }
-  std::cout << ",";
+  Out << ",";
   if (!run.report_label.empty()) {
     // Field with embedded double-quote characters must be doubled and the field
     // delimited with double-quotes.
     std::string label = run.report_label;
     ReplaceAll(&label, "\"", "\"\"");
-    std::cout << "\"" << label << "\"";
+    Out << "\"" << label << "\"";
   }
-  std::cout << '\n';
+  Out << ",,";  // for error_occurred and error_message
+  Out << '\n';
 }
 
 }  // end namespace benchmark
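
With the column order above (benchmark names and numbers here are hypothetical, purely to illustrate the row shape), a normal run and an errored run would serialize as:

  "BM_Demo/1024",1000000,135,133,ns,,,,,
  "BM_Broken",,,,,,,,true,"setup failed"
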
diff --git a/src/cycleclock.h b/src/cycleclock.h
index 3110804..e4825d4 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -113,11 +113,11 @@
   uint32_t pmuseren;
   uint32_t pmcntenset;
   // Read the user mode perf monitor counter access permissions.
-  asm("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+  asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
   if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
-    asm("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+    asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
     if (pmcntenset & 0x80000000ul) {  // Is it counting?
-      asm("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+      asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
       // The counter is set up to count every 64th cycle
       return static_cast<int64_t>(pmccntr) * 64;  // Should optimize to << 6
     }
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index def50ac..485d305 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -13,10 +13,13 @@
 // limitations under the License.
 
 #include "benchmark/reporter.h"
+#include "complexity.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <iostream>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "string_util.h"
@@ -51,7 +54,7 @@
 } // end namespace
 
 bool JSONReporter::ReportContext(const Context& context) {
-  std::ostream& out = std::cout;
+  std::ostream& out = GetOutputStream();
 
   out << "{\n";
   std::string inner_indent(2, ' ');
@@ -90,70 +93,86 @@
     return;
   }
   std::string indent(4, ' ');
-  std::ostream& out = std::cout;
+  std::ostream& out = GetOutputStream();
   if (!first_report_) {
     out << ",\n";
   }
   first_report_ = false;
-  std::vector<Run> reports_cp = reports;
-  if (reports.size() >= 2) {
-    Run mean_data;
-    Run stddev_data;
-    BenchmarkReporter::ComputeStats(reports, &mean_data, &stddev_data);
-    reports_cp.push_back(mean_data);
-    reports_cp.push_back(stddev_data);
-  }
-  for (auto it = reports_cp.begin(); it != reports_cp.end(); ++it) {
-     out << indent << "{\n";
-     PrintRunData(*it);
-     out << indent << '}';
-     auto it_cp = it;
-     if (++it_cp != reports_cp.end()) {
-         out << ",\n";
-     }
+
+  for (auto it = reports.begin(); it != reports.end(); ++it) {
+    out << indent << "{\n";
+    PrintRunData(*it);
+    out << indent << '}';
+    auto it_cp = it;
+    if (++it_cp != reports.end()) {
+      out << ",\n";
+    }
   }
 }
 
 void JSONReporter::Finalize() {
-    // Close the list of benchmarks and the top level object.
-    std::cout << "\n  ]\n}\n";
+  // Close the list of benchmarks and the top level object.
+  GetOutputStream() << "\n  ]\n}\n";
 }
 
 void JSONReporter::PrintRunData(Run const& run) {
-    double const multiplier = 1e9; // nano second multiplier
-    double cpu_time = run.cpu_accumulated_time * multiplier;
-    double real_time = run.real_accumulated_time * multiplier;
-    if (run.iterations != 0) {
-        real_time = real_time / static_cast<double>(run.iterations);
-        cpu_time = cpu_time / static_cast<double>(run.iterations);
-    }
-
-    std::string indent(6, ' ');
-    std::ostream& out = std::cout;
+  std::string indent(6, ' ');
+  std::ostream& out = GetOutputStream();
     out << indent
         << FormatKV("name", run.benchmark_name)
         << ",\n";
+    if (run.error_occurred) {
+        out << indent
+            << FormatKV("error_occurred", run.error_occurred)
+            << ",\n";
+        out << indent
+            << FormatKV("error_message", run.error_message)
+            << ",\n";
+    }
+  if (!run.report_big_o && !run.report_rms) {
+        out << indent
+            << FormatKV("iterations", run.iterations)
+            << ",\n";
+        out << indent
+            << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
+            << ",\n";
+        out << indent
+            << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
+        out << ",\n" << indent
+            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+  } else if (run.report_big_o) {
     out << indent
-        << FormatKV("iterations", run.iterations)
+        << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime()))
         << ",\n";
     out << indent
-        << FormatKV("real_time", RoundDouble(real_time))
+        << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime()))
         << ",\n";
     out << indent
-        << FormatKV("cpu_time", RoundDouble(cpu_time));
-    if (run.bytes_per_second > 0.0) {
-        out << ",\n" << indent
-            << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
-    }
-    if (run.items_per_second > 0.0) {
-        out << ",\n" << indent
-            << FormatKV("items_per_second", RoundDouble(run.items_per_second));
-    }
-    if (!run.report_label.empty()) {
-        out << ",\n" << indent
-            << FormatKV("label", run.report_label);
-    }
-    out << '\n';
+            << FormatKV("big_o", GetBigOString(run.complexity))
+            << ",\n";
+        out << indent
+            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+    } else if(run.report_rms) {
+        out << indent
+            << FormatKV("rms", RoundDouble(run.GetAdjustedCPUTime()*100))
+            << '%';
+  }
+  if (run.bytes_per_second > 0.0) {
+    out << ",\n"
+        << indent
+        << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
+  }
+  if (run.items_per_second > 0.0) {
+    out << ",\n"
+        << indent
+        << FormatKV("items_per_second", RoundDouble(run.items_per_second));
+  }
+  if (!run.report_label.empty()) {
+    out << ",\n"
+        << indent
+        << FormatKV("label", run.report_label);
+  }
+  out << '\n';
 }
 
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/reporter.cc b/src/reporter.cc
index 4b47e3d..5187859 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -13,74 +13,63 @@
 // limitations under the License.
 
 #include "benchmark/reporter.h"
+#include "walltime.h"
 
 #include <cstdlib>
+
+#include <iostream>
 #include <vector>
+#include <tuple>
 
 #include "check.h"
 #include "stat.h"
 
 namespace benchmark {
 
-void BenchmarkReporter::ComputeStats(
-    const std::vector<Run>& reports,
-    Run* mean_data, Run* stddev_data) {
-  CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
-  // Accumulators.
-  Stat1_d real_accumulated_time_stat;
-  Stat1_d cpu_accumulated_time_stat;
-  Stat1_d bytes_per_second_stat;
-  Stat1_d items_per_second_stat;
-  // All repetitions should be run with the same number of iterations so we
-  // can take this information from the first benchmark.
-  int64_t const run_iterations = reports.front().iterations;
-
-  // Populate the accumulators.
-  for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
-    CHECK_EQ(run_iterations, run.iterations);
-    real_accumulated_time_stat +=
-        Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
-    cpu_accumulated_time_stat +=
-        Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
-    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
-    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
-  }
-
-  // Get the data from the accumulator to BenchmarkReporter::Run's.
-  mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
-  mean_data->iterations = run_iterations;
-  mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() *
-                                     run_iterations;
-  mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() *
-                                    run_iterations;
-  mean_data->bytes_per_second = bytes_per_second_stat.Mean();
-  mean_data->items_per_second = items_per_second_stat.Mean();
-
-  // Only add label to mean/stddev if it is same for all runs
-  mean_data->report_label = reports[0].report_label;
-  for (std::size_t i = 1; i < reports.size(); i++) {
-    if (reports[i].report_label != reports[0].report_label) {
-      mean_data->report_label = "";
-      break;
-    }
-  }
-
-  stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
-  stddev_data->report_label = mean_data->report_label;
-  stddev_data->iterations = 0;
-  stddev_data->real_accumulated_time =
-      real_accumulated_time_stat.StdDev();
-  stddev_data->cpu_accumulated_time =
-      cpu_accumulated_time_stat.StdDev();
-  stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
-  stddev_data->items_per_second = items_per_second_stat.StdDev();
-}
-
-void BenchmarkReporter::Finalize() {
+BenchmarkReporter::BenchmarkReporter()
+    : output_stream_(&std::cout), error_stream_(&std::cerr)
+{
 }
 
 BenchmarkReporter::~BenchmarkReporter() {
 }
 
+void BenchmarkReporter::PrintBasicContext(std::ostream *out_ptr,
+                                          Context const &context) {
+  CHECK(out_ptr) << "cannot be null";
+  auto& Out = *out_ptr;
+
+  Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
+            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+
+  Out << LocalDateTimeString() << "\n";
+
+  if (context.cpu_scaling_enabled) {
+    Out << "***WARNING*** CPU scaling is enabled, the benchmark "
+                 "real time measurements may be noisy and will incur extra "
+                 "overhead.\n";
+  }
+
+#ifndef NDEBUG
+  Out << "***WARNING*** Library was built as DEBUG. Timings may be "
+               "affected.\n";
+#endif
+}
+
+double BenchmarkReporter::Run::GetAdjustedRealTime() const {
+  double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
+  if (iterations != 0)
+    new_time /= static_cast<double>(iterations);
+  return new_time;
+}
+
+double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
+  double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
+  if (iterations != 0)
+    new_time /= static_cast<double>(iterations);
+  return new_time;
+}
+
+
+
 } // end namespace benchmark
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index e10e19d..3a5d942 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -239,6 +239,7 @@
   }
 // TODO: also figure out cpuinfo_num_cpus
 
+
 #elif defined BENCHMARK_OS_WINDOWS
   // In NT, read MHz from the registry. If we fail to do so or we're in win9x
   // then make a crude estimate.
@@ -251,7 +252,10 @@
     cpuinfo_cycles_per_second = static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
   else
     cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-// TODO: also figure out cpuinfo_num_cpus
+
+  SYSTEM_INFO sysinfo = { 0 };
+  GetSystemInfo(&sysinfo);
+  cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors; // number of logical processors in the current group
 
 #elif defined BENCHMARK_OS_MACOSX
   // returning "mach time units" per second. the current number of elapsed
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a10a53a..aeb720a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,10 +2,6 @@
 
 find_package(Threads REQUIRED)
 
-set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
-string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
   target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
@@ -18,6 +14,7 @@
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
   add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+  add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)
 
 add_filter_test(filter_simple "Foo" 3)
@@ -36,16 +33,38 @@
 compile_benchmark_test(basic_test)
 add_test(basic_benchmark basic_test --benchmark_min_time=0.01)
 
+compile_benchmark_test(diagnostics_test)
+add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(skip_with_error_test)
+add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(donotoptimize_test)
+add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01)
+
 compile_benchmark_test(fixture_test)
 add_test(fixture_test fixture_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(map_test)
 add_test(map_test map_test --benchmark_min_time=0.01)
 
-compile_benchmark_test(cxx03_test)
-set_target_properties(cxx03_test
-    PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
-add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+compile_benchmark_test(reporter_output_test)
+add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
+
+check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
+if (BENCHMARK_HAS_CXX03_FLAG)
+  set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+  string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+
+  compile_benchmark_test(cxx03_test)
+  set_target_properties(cxx03_test
+      PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
+  add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+endif()
+
+compile_benchmark_test(complexity_test)
+add_test(complexity_benchmark complexity_test --benchmark_min_time=0.01)
 
 # Add the coverage command(s)
 if(CMAKE_BUILD_TYPE)
@@ -66,7 +85,7 @@
       COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov
       COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov
       COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark
-      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test
+      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test
       WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
       COMMENT "Running LCOV"
     )
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 97abb68..66f5956 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -14,6 +14,9 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <chrono>
+#include <thread>
+#include <utility>
 
 #if defined(__GNUC__)
 # define BENCHMARK_NOINLINE __attribute__((noinline))
@@ -174,5 +177,48 @@
 }
 BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
 
+static void BM_ManualTiming(benchmark::State& state) {
+  size_t slept_for = 0;
+  int microseconds = state.range_x();
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  while (state.KeepRunning()) {
+    auto start   = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(std::chrono::duration_cast<
+      std::chrono::nanoseconds>(sleep_duration));
+    auto end     = std::chrono::high_resolution_clock::now();
+
+    auto elapsed =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed.count());
+    slept_for += microseconds;
+  }
+  state.SetItemsProcessed(slept_for);
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
+
+#if __cplusplus >= 201103L
+
+template <class ...Args>
+void BM_with_args(benchmark::State& state, Args&&...) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44);
+BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test,
+                  std::string("abc"), std::pair<int, double>(42, 3.8));
+
+void BM_non_template_args(benchmark::State& state, int, double) {
+  while(state.KeepRunning()) {}
+}
+BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
+
+#endif // __cplusplus >= 201103L
+
 BENCHMARK_MAIN()
 
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
new file mode 100644
index 0000000..8ab88f9
--- /dev/null
+++ b/test/complexity_test.cc
@@ -0,0 +1,297 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include "../src/re.h"    // NOTE: re.h is for internal use only
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <cmath>
+
+namespace {
+
+// ========================================================================= //
+// -------------------------- Testing Case --------------------------------- //
+// ========================================================================= //
+
+enum MatchRules {
+  MR_Default, // Skip non-matching lines until a match is found.
+  MR_Next    // Match must occur on the next line.
+};
+
+struct TestCase {
+  std::string regex;
+  int match_rule;
+
+  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
+
+  void Check(std::stringstream& remaining_output) const {
+    benchmark::Regex r;
+    std::string err_str;
+    r.Init(regex, &err_str);
+    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
+                           << " got Error: " << err_str;
+
+    std::string line;
+    while (remaining_output.eof() == false) {
+        CHECK(remaining_output.good());
+        std::getline(remaining_output, line);
+        if (r.Match(line)) return;
+        CHECK(match_rule != MR_Next) << "Expected line \"" << line
+                                     << "\" to match regex \"" << regex << "\"";
+    }
+
+    CHECK(remaining_output.eof() == false)
+        << "End of output reached before match for regex \"" << regex
+        << "\" was found";
+  }
+};
+
+std::vector<TestCase> ConsoleOutputTests;
+std::vector<TestCase> JSONOutputTests;
+std::vector<TestCase> CSVOutputTests;
+
+// ========================================================================= //
+// -------------------------- Test Helpers --------------------------------- //
+// ========================================================================= //
+
+class TestReporter : public benchmark::BenchmarkReporter {
+public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps)  {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    return last_ret;
+  }
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_)
+      rep->ReportRuns(report);
+  }
+
+  virtual void Finalize() {
+      for (auto rep : reporters_)
+        rep->Finalize();
+  }
+
+private:
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
+};
+
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
+  for (auto const& TC : v)
+    out->push_back(TC);
+  return 0;
+}
+
+template <class First>
+std::string join(First f) { return f; }
+
+template <class First, class ...Args>
+std::string join(First f, Args&&... args) {
+    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
+}
+
+std::string dec_re = "[0-9]+\\.[0-9]+";
+
+#define ADD_COMPLEXITY_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+
+int AddComplexityTest(std::vector<TestCase>* console_out, std::vector<TestCase>* json_out,
+                      std::vector<TestCase>* csv_out, std::string big_o_test_name, 
+                      std::string rms_test_name, std::string big_o) {
+  std::string big_o_str = dec_re + " " + big_o;
+  AddCases(console_out, {
+    {join("^" + big_o_test_name + "", big_o_str, big_o_str) + "[ ]*$"},
+    {join("^" + rms_test_name + "", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
+  });
+  AddCases(json_out, {
+    {"\"name\": \"" + big_o_test_name + "\",$"},
+    {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
+    {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
+    {"\"big_o\": \"" + big_o + "\",$", MR_Next},
+    {"\"time_unit\": \"ns\"$", MR_Next},
+    {"}", MR_Next},
+    {"\"name\": \"" + rms_test_name + "\",$"},
+    {"\"rms\": [0-9]+%$", MR_Next},
+    {"}", MR_Next}
+  });
+  AddCases(csv_out, {
+    {"^\"" + big_o_test_name + "\",," + dec_re + "," + dec_re + "," + big_o + ",,,,,$"},
+    {"^\"" + rms_test_name + "\",," + dec_re + "," + dec_re + ",,,,,,$"}
+  });
+  return 0;
+}
+
+}  // end namespace
+
+// ========================================================================= //
+// --------------------------- Testing BigO O(1) --------------------------- //
+// ========================================================================= //
+
+void BM_Complexity_O1(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity([](int){return 1.0; });
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity();
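+// The same benchmark is registered three ways: with an explicit complexity
+// enum (benchmark::o1), with a user-supplied lambda, and with automatic
+// complexity deduction via Complexity() taking no argument.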
+
+const char* big_o_1_test_name = "BM_Complexity_O1_BigO";
+const char* rms_o_1_test_name = "BM_Complexity_O1_RMS";
+const char* enum_auto_big_o_1 = "\\([0-9]+\\)";
+const char* lambda_big_o_1 = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_1_test_name, rms_o_1_test_name, enum_auto_big_o_1);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+
+// ========================================================================= //
+// --------------------------- Testing BigO O(N) --------------------------- //
+// ========================================================================= //
+
+std::vector<int> ConstructRandomVector(int size) {
+  std::vector<int> v;
+  v.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    v.push_back(rand() % size);
+  }
+  return v;
+}
+
+void BM_Complexity_O_N(benchmark::State& state) {
+  auto v = ConstructRandomVector(state.range_x());
+  const int item_not_in_vector = state.range_x()*2; // Test worst case scenario (item not in vector)
+  while (state.KeepRunning()) {
+      benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oN);
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) -> double{return n; });
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+
+const char* big_o_n_test_name = "BM_Complexity_O_N_BigO";
+const char* rms_o_n_test_name = "BM_Complexity_O_N_RMS";
+const char* enum_auto_big_o_n = "N";
+const char* lambda_big_o_n = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+
+// ========================================================================= //
+// ------------------------- Testing BigO O(N*lgN) ------------------------- //
+// ========================================================================= //
+
+static void BM_Complexity_O_N_log_N(benchmark::State& state) {
+  auto v = ConstructRandomVector(state.range_x());
+  while (state.KeepRunning()) {
+      std::sort(v.begin(), v.end());
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oNLogN);
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) {return n * std::log2(n); });
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+
+const char* big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
+const char* rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
+const char* enum_auto_big_o_n_lg_n = "NlgN";
+const char* lambda_big_o_n_lg_n = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
+
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+
+int main(int argc, char* argv[]) {
+  // Add --color_print=false to argv since we don't want to match color codes.
+  char new_arg[64];
+  char* new_argv[64];
+  std::copy(argv, argv + argc, new_argv);
+  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
+  benchmark::Initialize(&argc, new_argv);
+
+  benchmark::ConsoleReporter CR;
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n,
+                 std::vector<TestCase>& out_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), reporter(br) {
+        reporter.SetOutputStream(&out_stream);
+        reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", ConsoleOutputTests, CR},
+      {"JSONReporter", JSONOutputTests, JR},
+      {"CSVReporter", CSVOutputTests, CSVR}
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+      std::string banner(msg.size() - 1, '-');
+      std::cout << banner << msg << banner << "\n";
+
+      std::cerr << rep_test.err_stream.str();
+      std::cout << rep_test.out_stream.str();
+
+      for (const auto& TC : rep_test.output_cases)
+        TC.Check(rep_test.out_stream);
+
+      std::cout << "\n";
+  }
+  return 0;
+}
+
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
new file mode 100644
index 0000000..60fa3b1
--- /dev/null
+++ b/test/diagnostics_test.cc
@@ -0,0 +1,61 @@
+// Testing:
+//   State::PauseTiming()
+//   State::ResumeTiming()
+// Test that the CHECKs within these functions diagnose when they are called
+// outside of the KeepRunning() loop.
+//
+// NOTE: Users should NOT include or use src/check.h. This is only done in
+// order to test library internals.
+
+#include "benchmark/benchmark_api.h"
+#include "../src/check.h"
+#include <stdexcept>
+#include <cstdlib>
+
+#if defined(__GNUC__) && !defined(__EXCEPTIONS)
+#define TEST_HAS_NO_EXCEPTIONS
+#endif
+
+void TestHandler() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  throw std::logic_error("");
+#else
+  std::abort();
+#endif
+}
+
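+// Calling Pause/ResumeTiming() outside of the KeepRunning() loop should trip
+// the library's internal CHECKs; with TestHandler installed those CHECKs
+// throw, so the calls are wrapped in try/catch (debug builds with exceptions
+// enabled only).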
+void try_invalid_pause_resume(benchmark::State& state) {
+#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
+  try {
+    state.PauseTiming();
+    std::abort();
+  } catch (std::logic_error const&) {}
+  try {
+    state.ResumeTiming();
+    std::abort();
+  } catch (std::logic_error const&) {}
+#else
+  (void)state; // avoid unused warning
+#endif
+}
+
+void BM_diagnostic_test(benchmark::State& state) {
+  static bool called_once = false;
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  called_once = true;
+}
+BENCHMARK(BM_diagnostic_test);
+
+int main(int argc, char** argv) {
+  benchmark::internal::GetAbortHandler() = &TestHandler;
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
new file mode 100644
index 0000000..e4453fb
--- /dev/null
+++ b/test/donotoptimize_test.cc
@@ -0,0 +1,36 @@
+#include "benchmark/benchmark.h"
+
+#include <cstdint>
+
+namespace {
+#if defined(__GNUC__)
+  std::uint64_t double_up(const std::uint64_t x) __attribute__ ((const));
+#endif
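+  // __attribute__((const)) above (GCC/Clang only) tells the compiler the call
+  // has no side effects, making it a prime candidate for elimination unless
+  // DoNotOptimize() keeps its result alive.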
+  std::uint64_t double_up(const std::uint64_t x) {
+    return x * 2;
+  }
+}
+
+int main(int, char*[]) {
+
+  // This test verifies that DoNotOptimize() compiles for a variety of argument types.
+
+  char buffer8[8];
+  benchmark::DoNotOptimize(buffer8);
+
+  char buffer20[20];
+  benchmark::DoNotOptimize(buffer20);
+
+  char buffer1024[1024];
+  benchmark::DoNotOptimize(buffer1024);
+  benchmark::DoNotOptimize(&buffer1024[0]);
+
+  int x = 123;
+  benchmark::DoNotOptimize(x);
+  benchmark::DoNotOptimize(&x);
+  benchmark::DoNotOptimize(x += 42);
+
+  benchmark::DoNotOptimize(double_up(x));
+
+  return 0;
+}
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 2a278ff..0ba4071 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -68,24 +68,38 @@
 
 
 
-int main(int argc, char* argv[]) {
+int main(int argc, char** argv) {
+  bool list_only = false;
+  for (int i=0; i < argc; ++i)
+    list_only |= std::string(argv[i]).find("--benchmark_list_tests") != std::string::npos;
+
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
-  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  const size_t returned_count = benchmark::RunSpecifiedBenchmarks(&test_reporter);
 
   if (argc == 2) {
     // Make sure we ran all of the tests
     std::stringstream ss(argv[1]);
-    size_t expected;
-    ss >> expected;
+    size_t expected_return;
+    ss >> expected_return;
 
-    const size_t count = test_reporter.GetCount();
-    if (count != expected) {
-      std::cerr << "ERROR: Expected " << expected << " tests to be ran but only "
-                << count << " completed" << std::endl;
+    if (returned_count != expected_return) {
+      std::cerr << "ERROR: Expected " << expected_return
+                << " tests to match the filter but returned_count = "
+                << returned_count << std::endl;
+      return -1;
+    }
+
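+    // With --benchmark_list_tests the benchmarks are only listed, so the
+    // reporter should not have received any runs even though the matching
+    // count checked above is unchanged.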
+    const size_t expected_reports = list_only ? 0 : expected_return;
+    const size_t reports_count = test_reporter.GetCount();
+    if (reports_count != expected_reports) {
+      std::cerr << "ERROR: Expected " << expected_reports
+                << " tests to be run but reported_count = " << reports_count
+                << std::endl;
       return -1;
     }
   }
+
   return 0;
 }
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index 92fbc4c..bf800fd 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -6,14 +6,18 @@
 
 class MyFixture : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State&) {
-    assert(data.get() == nullptr);
-    data.reset(new int(42));
+  void SetUp(const ::benchmark::State& state) {
+    if (state.thread_index == 0) {
+      assert(data.get() == nullptr);
+      data.reset(new int(42));
+    }
   }
 
-  void TearDown() {
-    assert(data.get() != nullptr);
-    data.release();
+  void TearDown(const ::benchmark::State& state) {
+    if (state.thread_index == 0) {
+      assert(data.get() != nullptr);
+      data.reset();
+    }
   }
 
   ~MyFixture() {
@@ -32,10 +36,17 @@
 }
 
 BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+  if (st.thread_index == 0) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
+  }
   while (st.KeepRunning()) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
   }
   st.SetItemsProcessed(st.range_x());
 }
 BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
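+// Also run with one thread per CPU; SetUp()/TearDown() are invoked for every
+// thread, which is why the thread_index == 0 guards above touch the shared
+// data only once.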
+BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
 
 BENCHMARK_MAIN()
diff --git a/test/map_test.cc b/test/map_test.cc
index 8d5f6ec..5eccf8d 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -1,5 +1,6 @@
 #include "benchmark/benchmark.h"
 
+#include <cstdlib>
 #include <map>
 
 namespace {
@@ -36,7 +37,7 @@
     m = ConstructRandomMap(st.range_x());
   }
 
-  void TearDown() {
+  void TearDown(const ::benchmark::State&) {
     m.clear();
   }
 
diff --git a/test/options_test.cc b/test/options_test.cc
index d4c682d..78cedae 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -1,12 +1,29 @@
 #include "benchmark/benchmark_api.h"
 
+#include <chrono>
+#include <thread>
+
 void BM_basic(benchmark::State& state) {
   while (state.KeepRunning()) {
   }
 }
+
+void BM_basic_slow(benchmark::State& state) {
+  std::chrono::milliseconds sleep_duration(state.range_x());
+  while (state.KeepRunning()) {
+    std::this_thread::sleep_for(
+      std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration)
+      );
+  }
+}
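+// Sleeping for range_x() milliseconds per iteration makes the measured time
+// large enough that the different Unit() settings below are meaningful.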
+
 BENCHMARK(BM_basic);
 BENCHMARK(BM_basic)->Arg(42);
+BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
 BENCHMARK(BM_basic)->Range(1, 8);
+BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
 BENCHMARK(BM_basic)->ArgPair(42, 42);
 BENCHMARK(BM_basic)->RangePair(64, 512, 64, 512);
@@ -14,6 +31,7 @@
 BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
+BENCHMARK(BM_basic)->Repetitions(3);
 
 void CustomArgs(benchmark::internal::Benchmark* b) {
   for (int i = 0; i < 10; ++i) {
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
new file mode 100644
index 0000000..b3898ac
--- /dev/null
+++ b/test/reporter_output_test.cc
@@ -0,0 +1,259 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include "../src/re.h" // NOTE: re.h is for internal use only
+#include <algorithm>  // for std::copy (used in main below)
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <utility>
+
+namespace {
+
+// ========================================================================= //
+// -------------------------- Testing Case --------------------------------- //
+// ========================================================================= //
+
+enum MatchRules {
+  MR_Default, // Skip non-matching lines until a match is found.
+  MR_Next    // Match must occur on the next line.
+};
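+// MR_Next is used below to pin consecutive lines of a report, e.g.
+//   ADD_CASES(&JSONOutputTests, {
+//       {"\"name\": \"BM_basic\",$"},
+//       {"\"iterations\": [0-9]+,$", MR_Next}
+//   });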
+
+struct TestCase {
+  std::string regex;
+  int match_rule;
+
+  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
+
+  void Check(std::stringstream& remaining_output) const {
+    benchmark::Regex r;
+    std::string err_str;
+    r.Init(regex, &err_str);
+    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
+                           << " got Error: " << err_str;
+
+    std::string line;
+    while (remaining_output.eof() == false) {
+        CHECK(remaining_output.good());
+        std::getline(remaining_output, line);
+        if (r.Match(line)) return;
+        CHECK(match_rule != MR_Next) << "Expected line \"" << line
+                                     << "\" to match regex \"" << regex << "\"";
+    }
+
+    CHECK(remaining_output.eof() == false)
+        << "End of output reached before match for regex \"" << regex
+        << "\" was found";
+  }
+};
+
+std::vector<TestCase> ConsoleOutputTests;
+std::vector<TestCase> JSONOutputTests;
+std::vector<TestCase> CSVOutputTests;
+
+std::vector<TestCase> ConsoleErrorTests;
+std::vector<TestCase> JSONErrorTests;
+std::vector<TestCase> CSVErrorTests;
+
+// ========================================================================= //
+// -------------------------- Test Helpers --------------------------------- //
+// ========================================================================= //
+
+class TestReporter : public benchmark::BenchmarkReporter {
+public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps)  {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    return last_ret;
+  }
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_)
+      rep->ReportRuns(report);
+  }
+
+  virtual void Finalize() {
+      for (auto rep : reporters_)
+        rep->Finalize();
+  }
+
+private:
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
+};
+
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
+  for (auto const& TC : v)
+    out->push_back(TC);
+  return 0;
+}
+
+template <class First>
+std::string join(First f) { return f; }
+
+template <class First, class ...Args>
+std::string join(First f, Args&&... args) {
+    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
+}
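+// e.g. join("^Benchmark", "Time", "CPU", "Iterations$") builds the regex for
+// the console header row matched below.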
+
+std::string dec_re = "[0-9]+\\.[0-9]+";
+
+}  // end namespace
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+ADD_CASES(&ConsoleOutputTests, {
+    {join("^Benchmark", "Time", "CPU", "Iterations$"), MR_Next},
+    {"^[-]+$", MR_Next}
+});
+ADD_CASES(&CSVOutputTests, {
+  {"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,"
+    "label,error_occurred,error_message"}
+});
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_basic(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_basic);
+
+ADD_CASES(&ConsoleOutputTests, {
+    {"^BM_basic[ ]+[0-9]{1,5} ns[ ]+[0-9]{1,5} ns[ ]+[0-9]+$"}
+});
+ADD_CASES(&JSONOutputTests, {
+    {"\"name\": \"BM_basic\",$"},
+    {"\"iterations\": [0-9]+,$", MR_Next},
+    {"\"real_time\": [0-9]{1,5},$", MR_Next},
+    {"\"cpu_time\": [0-9]{1,5},$", MR_Next},
+    {"\"time_unit\": \"ns\"$", MR_Next},
+    {"}", MR_Next}
+});
+ADD_CASES(&CSVOutputTests, {
+    {"^\"BM_basic\",[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,$"}
+});
+
+// ========================================================================= //
+// ------------------------ Testing Error Output --------------------------- //
+// ========================================================================= //
+
+void BM_error(benchmark::State& state) {
+    state.SkipWithError("message");
+    while(state.KeepRunning()) {}
+}
+BENCHMARK(BM_error);
+ADD_CASES(&ConsoleOutputTests, {
+    {"^BM_error[ ]+ERROR OCCURRED: 'message'$"}
+});
+ADD_CASES(&JSONOutputTests, {
+    {"\"name\": \"BM_error\",$"},
+    {"\"error_occurred\": true,$", MR_Next},
+    {"\"error_message\": \"message\",$", MR_Next}
+});
+
+ADD_CASES(&CSVOutputTests, {
+    {"^\"BM_error\",,,,,,,,true,\"message\"$"}
+});
+
+
+// ========================================================================= //
+// ----------------------- Testing Complexity Output ----------------------- //
+// ========================================================================= //
+
+void BM_Complexity_O1(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O1)->Range(1, 1<<18)->Complexity(benchmark::o1);
+
+std::string bigOStr = "[0-9]+\\.[0-9]+ \\([0-9]+\\)";
+
+ADD_CASES(&ConsoleOutputTests, {
+   {join("^BM_Complexity_O1_BigO", bigOStr, bigOStr) + "[ ]*$"},
+   {join("^BM_Complexity_O1_RMS", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
+});
+
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+
+int main(int argc, char* argv[]) {
+  // Add --color_print=false to argv since we don't want to match color codes.
+  char new_arg[64];
+  char* new_argv[64];
+  std::copy(argv, argv + argc, new_argv);
+  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
+  benchmark::Initialize(&argc, new_argv);
+
+  benchmark::ConsoleReporter CR;
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    std::vector<TestCase>& error_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n,
+                 std::vector<TestCase>& out_tc,
+                 std::vector<TestCase>& err_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
+        reporter.SetOutputStream(&out_stream);
+        reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", ConsoleOutputTests, ConsoleErrorTests, CR},
+      {"JSONReporter", JSONOutputTests, JSONErrorTests, JR},
+      {"CSVReporter", CSVOutputTests, CSVErrorTests, CSVR}
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+      std::string banner(msg.size() - 1, '-');
+      std::cout << banner << msg << banner << "\n";
+
+      std::cerr << rep_test.err_stream.str();
+      std::cout << rep_test.out_stream.str();
+
+      for (const auto& TC : rep_test.error_cases)
+        TC.Check(rep_test.err_stream);
+      for (const auto& TC : rep_test.output_cases)
+        TC.Check(rep_test.out_stream);
+
+      std::cout << "\n";
+  }
+  return 0;
+}
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
new file mode 100644
index 0000000..dafbd64
--- /dev/null
+++ b/test/skip_with_error_test.cc
@@ -0,0 +1,161 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include <cassert>
+#include <vector>
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) {
+    return ConsoleReporter::ReportContext(context);
+  }
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    all_runs_.insert(all_runs_.end(), begin(report), end(report));
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  TestReporter()  {}
+  virtual ~TestReporter() {}
+
+  mutable std::vector<Run> all_runs_;
+};
+
+struct TestCase {
+  std::string name;
+  bool error_occurred;
+  std::string error_message;
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+
+  void CheckRun(Run const& run) const {
+    CHECK(name == run.benchmark_name) << "expected " << name << " got " << run.benchmark_name;
+    CHECK(error_occurred == run.error_occurred);
+    CHECK(error_message == run.error_message);
+    if (error_occurred) {
+      //CHECK(run.iterations == 0);
+    } else {
+      CHECK(run.iterations != 0);
+    }
+  }
+};
+
+std::vector<TestCase> ExpectedResults;
+
+int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
+  for (auto TC : v) {
+    TC.name = base_name + TC.name;
+    ExpectedResults.push_back(std::move(TC));
+  }
+  return 0;
+}
+
+#define CONCAT(x, y) CONCAT2(x, y)
+#define CONCAT2(x, y) x##y
+#define ADD_CASES(...) \
+int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+}  // end namespace
+
+
+void BM_error_before_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+  while (state.KeepRunning()) {
+    assert(false);
+  }
+}
+BENCHMARK(BM_error_before_running);
+ADD_CASES("BM_error_before_running",
+          {{"", true, "error message"}});
+
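+// Only the range_x() == 1 runs, and only the lower half of the thread
+// indices, call SkipWithError(); a single erroring thread is enough for the
+// whole run to be reported as failed, as the /1/... expectations reflect.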
+void BM_error_during_running(benchmark::State& state) {
+  bool first_iter = true;
+  while (state.KeepRunning()) {
+    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+      assert(first_iter);
+      first_iter = false;
+      state.SkipWithError("error message");
+    } else {
+      state.PauseTiming();
+      state.ResumeTiming();
+    }
+  }
+}
+BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_during_running",
+    {{"/1/threads:1", true, "error message"},
+    {"/1/threads:2", true, "error message"},
+    {"/1/threads:4", true, "error message"},
+    {"/1/threads:8", true, "error message"},
+    {"/2/threads:1", false, ""},
+    {"/2/threads:2", false, ""},
+    {"/2/threads:4", false, ""},
+    {"/2/threads:8", false, ""}}
+);
+
+void BM_error_after_running(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  if (state.thread_index <= (state.threads / 2))
+    state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_after_running",
+    {{"/threads:1", true, "error message"},
+    {"/threads:2", true, "error message"},
+    {"/threads:4", true, "error message"},
+    {"/threads:8", true, "error message"}}
+);
+
+void BM_error_while_paused(benchmark::State& state) {
+  bool first_iter = true;
+  while (state.KeepRunning()) {
+    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+      assert(first_iter);
+      first_iter = false;
+      state.PauseTiming();
+      state.SkipWithError("error message");
+    } else {
+      state.PauseTiming();
+      state.ResumeTiming();
+    }
+  }
+}
+BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_while_paused",
+    {{"/1/threads:1", true, "error message"},
+    {"/1/threads:2", true, "error message"},
+    {"/1/threads:4", true, "error message"},
+    {"/1/threads:8", true, "error message"},
+    {"/2/threads:1", false, ""},
+    {"/2/threads:2", false, ""},
+    {"/2/threads:4", false, ""},
+    {"/2/threads:8", false, ""}}
+);
+
+
+int main(int argc, char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  TestReporter test_reporter;
+  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
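+  // The reported runs are checked positionally against ExpectedResults, which
+  // assumes benchmarks are run in registration order.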
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+
+  return 0;
+}