release-request-f3bcfb00-8080-4b95-b497-7457f544b4f6-for-git_oc-mr1-release-4116097 snap-temp-L44400000075746008

Change-Id: I94b8833bd65c10e178797d2b656382da3835494b
diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
index 1b6b585..a591743 100644
--- a/.travis-libcxx-setup.sh
+++ b/.travis-libcxx-setup.sh
@@ -10,12 +10,18 @@
 git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
 git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
 
+# Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
 # Build and install libc++ (Use unstable ABI for better sanitizer coverage)
 mkdir llvm-build && cd llvm-build
 cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
       -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
       -DLIBCXX_ABI_UNSTABLE=ON \
       -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
       ../llvm-source
 make cxx -j2
 sudo make install-cxxabi install-cxx
diff --git a/.travis.yml b/.travis.yml
index 19c68dd..36df088 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,81 +6,138 @@
   global:
     - /usr/local/bin:$PATH
 
-# NOTE: The COMPILER variable is unused. It simply makes the display on
-# travis-ci.org more readable.
 matrix:
-    include:
-        - compiler: gcc
-          addons:
-            apt:
-              packages:
-                - lcov
-          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Coverage
-        - compiler: gcc
-          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Debug
-        - compiler: gcc
-          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Release
-        - compiler: gcc
-          addons:
-            apt:
-              sources:
-                - ubuntu-toolchain-r-test
-              packages:
-                - g++-6
-          env:
-            - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-            - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-        - compiler: clang
-          env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-        - compiler: clang
-          env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
-        # Clang w/ libc++
-        - compiler: clang
-          addons:
-            apt:
-              packages:
-                clang-3.8
-          env:
-            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-            - LIBCXX_BUILD=1
-            - EXTRA_FLAGS="-stdlib=libc++"
-        # Clang w/ libc++, ASAN, UBSAN
-        - compiler: clang
-          addons:
-            apt:
-              packages:
-                clang-3.8
-          env:
-            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-            - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-            - EXTRA_FLAGS="-stdlib=libc++ -fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fno-sanitize-recover=all"
-            - UBSAN_OPTIONS=print_stacktrace=1
-        # Clang w/ libc++ and MSAN
-        - compiler: clang
-          addons:
-            apt:
-              packages:
-                clang-3.8
-          env:
-            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-            - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-            - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-        # Clang w/ libc++ and MSAN
-        - compiler: clang
-          addons:
-            apt:
-              packages:
-                clang-3.8
-          env:
-            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-            - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-            - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
+  include:
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - lcov
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
+    - compiler: gcc
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
+    - compiler: gcc
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-6
+      env:
+        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
+        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
+    - compiler: clang
+      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
+    - compiler: clang
+      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
+    # Clang w/ libc++
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - EXTRA_FLAGS="-stdlib=libc++"
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - EXTRA_FLAGS="-stdlib=libc++"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-stdlib=libc++ -m32"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-stdlib=libc++ -m32"
+    # Clang w/ libc++, ASAN, UBSAN
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
+        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
+        - UBSAN_OPTIONS=print_stacktrace=1
+    # Clang w/ libc++ and MSAN
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
+        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+    # Clang w/ libc++ and MSAN
+    - compiler: clang
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
+        - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
+
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++ BUILD_TYPE=Debug
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++ BUILD_TYPE=Release
 
 before_script:
-    - if [ -n "${LIBCXX_BUILD}" ]; then
-        source .travis-libcxx-setup.sh;
-      fi
-    - mkdir build && cd build
+  - if [ -z "$BUILD_32_BITS" ]; then
+      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
+    fi
+  - if [ -n "${LIBCXX_BUILD}" ]; then
+      source .travis-libcxx-setup.sh;
+    fi
+  - mkdir build && cd build
 
 install:
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
@@ -90,13 +147,11 @@
     fi
 
 script:
-    - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" ..
-    - make
-    - make CTEST_OUTPUT_ON_FAILURE=1 test
+  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ..
+  - make
+  - ctest -C ${BUILD_TYPE} --output-on-failure
 
 after_success:
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
       coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
     fi
-
-sudo: required
diff --git a/AUTHORS b/AUTHORS
index 5a545fa..ae278df 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -18,17 +18,22 @@
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Google Inc.
+International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Jern-Kuan Leong <jernkuan@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Dirac Research 
 Zbigniew Skowron <zbychs@gmail.com>
diff --git a/Android.bp b/Android.bp
index ec822e5..4b25e3e 100644
--- a/Android.bp
+++ b/Android.bp
@@ -35,6 +35,7 @@
         "src/commandlineflags.cc",
         "src/complexity.cc",
         "src/console_reporter.cc",
+        "src/counter.cc",
         "src/csv_reporter.cc",
         "src/json_reporter.cc",
         "src/reporter.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bfd21b..f7f1566 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,5 @@
-cmake_minimum_required (VERSION 2.8.11)
+cmake_minimum_required (VERSION 2.8.12)
+
 project (benchmark)
 
 foreach(p
@@ -11,8 +12,11 @@
 endforeach()
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
+option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF)
+
 # Make sure we can import out CMake functions
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
@@ -33,12 +37,20 @@
 include(AddCXXCompilerFlag)
 include(CXXFeatureCheck)
 
+if (BENCHMARK_BUILD_32_BITS)
+  add_required_cxx_compiler_flag(-m32)
+endif()
+
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-EHs-)
+    add_cxx_compiler_flag(-EHa-)
+  endif()
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
@@ -80,12 +92,20 @@
   add_cxx_compiler_flag(-Wshorten-64-to-32)
   add_cxx_compiler_flag(-Wfloat-equal)
   add_cxx_compiler_flag(-fstrict-aliasing)
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
+  endif()
   if (NOT BENCHMARK_USE_LIBCXX)
     add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
   endif()
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    add_cxx_compiler_flag(-Wstrict-aliasing)
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+      add_cxx_compiler_flag(-Wstrict-aliasing)
+    endif()
   endif()
+  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
+  # (because of deprecated overload)
+  add_cxx_compiler_flag(-wd654)  
   add_cxx_compiler_flag(-Wthread-safety)
   if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
@@ -162,7 +182,10 @@
 if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
 endif()
-
+if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
+        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
+endif()
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
 find_package(Threads REQUIRED)
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 33cd941..58f81fd 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -34,19 +34,24 @@
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Jern-Kuan Leong <jernkuan@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Ray Glover <ray.glover@uk.ibm.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Zbigniew Skowron <zbychs@gmail.com>
diff --git a/README.md b/README.md
index 9109430..2430d93 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@
 
 [Known issues and common problems](#known-issues)
 
+[Additional Tooling Documentation](docs/tools.md)
+
 ## Example usage
 ### Basic usage
 Define a function that executes the code to be measured.
@@ -363,7 +365,7 @@
 }
 ```
 
-Note that `ClobberMemory()` is only available for GNU based compilers.
+Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
 
 ### Set time unit manually
 If a benchmark runs a few milliseconds it may be hard to visually compare the
@@ -430,6 +432,132 @@
 /* BarTest is now registered */
 ```
 
+
+## User-defined counters
+
+You can add your own counters with user-defined names. The example below
+will add columns "Foo", "Bar" and "Baz" in its output:
+
+```c++
+static void UserCountersExample1(benchmark::State& state) {
+  double numFoos = 0, numBars = 0, numBazs = 0;
+  while (state.KeepRunning()) {
+    // ... count Foo,Bar,Baz events
+  }
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
+}
+```
+
+The `state.counters` object is a `std::map` with `std::string` keys
+and `Counter` values. The latter is a `double`-like class, via an implicit
+conversion to `double&`. Thus you can use all of the standard arithmetic
+assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
+
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed;
+the resulting sum is the value which will be shown for the benchmark.
+
+The `Counter` constructor accepts two parameters: the value as a `double`
+and a bit flag which allows you to show counters as rates and/or as
+per-thread averages:
+
+```c++
+  // sets a simple counter
+  state.counters["Foo"] = numFoos;
+
+  // Set the counter as a rate. It will be presented divided
+  // by the duration of the benchmark.
+  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
+
+  // Set the counter as a thread-average quantity. It will
+  // be presented divided by the number of threads.
+  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
+
+  // There's also a combined flag:
+  state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
+```
+
+When you're compiling in C++11 mode or later you can use `insert()` with
+`std::initializer_list`:
+
+```c++
+  // With C++11, this can be done:
+  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
+  // ... instead of:
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
+```
+
+### Counter reporting
+
+When using the console reporter, by default, user counters are printed at
+the end after the table, the same way as ``bytes_processed`` and
+``items_processed``. This is best for cases in which there are few counters,
+or where there are only a couple of lines per benchmark. Here's an example of
+the default output:
+
+```
+------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations UserCounters...
+------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
+BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
+BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
+BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
+BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
+BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
+BM_Factorial                    26 ns         26 ns   26608979 40320
+BM_Factorial/real_time          26 ns         26 ns   26587936 40320
+BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
+BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
+BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
+BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
+```
+
+If this doesn't suit you, you can print each counter as a table column by
+passing the flag `--benchmark_counters_tabular=true` to the benchmark
+application. This is best for cases in which there are a lot of counters, or
+a lot of lines per individual benchmark. Note that this will trigger a
+reprinting of the table header any time the counter set changes between
+individual benchmarks. Here's an example of corresponding output when
+`--benchmark_counters_tabular=true` is passed:
+
+```
+---------------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
+---------------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
+BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
+BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
+BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
+BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
+BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
+BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
+BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
+--------------------------------------------------------------
+Benchmark                        Time           CPU Iterations
+--------------------------------------------------------------
+BM_Factorial                    26 ns         26 ns   26392245 40320
+BM_Factorial/real_time          26 ns         26 ns   26494107 40320
+BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
+BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
+BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
+BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
+BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
+BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
+BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
+BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
+BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
+```
+Note above the additional header printed when the benchmark changes from
+``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
+not have the same counter set as ``BM_UserCounter``.
+
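As a quick illustration of the user-defined counters API documented above, a minimal self-contained benchmark might look like the sketch below (not part of this patch; `BM_ProcessItems` and the counter names are made up):

```c++
#include <benchmark/benchmark.h>

// Hypothetical benchmark exercising the counters introduced by this change.
static void BM_ProcessItems(benchmark::State& state) {
  double items = 0;
  while (state.KeepRunning()) {
    // ... process one item per iteration
    items += 1;
  }
  // Plain counter: per-thread values are summed when the benchmark finishes.
  state.counters["Items"] = items;
  // Rate counter: divided by the benchmark's duration when reported.
  state.counters["ItemRate"] =
      benchmark::Counter(items, benchmark::Counter::kIsRate);
}
BENCHMARK(BM_ProcessItems)->Threads(4);

BENCHMARK_MAIN();
```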
 ## Exiting Benchmarks in Error
 
 When errors caused by external influences, such as file I/O and network
@@ -501,7 +629,7 @@
 information about the CPU and the date.
 The `benchmarks` attribute contains a list of ever benchmark run. Example json
 output looks like:
-``` json
+```json
 {
   "context": {
     "date": "2015/03/17-18:40:25",
@@ -582,6 +710,7 @@
 * GCC 4.8
 * Clang 3.4
 * Visual Studio 2013
+* Intel 2015 Update 1
 
 Anything older *may* work.
 
diff --git a/appveyor.yml b/appveyor.yml
index 204f30d..e084f38 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,16 +1,18 @@
 version: '{build}'
 
+image: Visual Studio 2017
+
 configuration:
   - Debug
   - Release
 
 environment:
   matrix:
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013"
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017"
 
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013 Win64"
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017 Win64"
 
     - compiler: msvc-14-seh
       generator: "Visual Studio 14 2015"
@@ -18,9 +20,16 @@
     - compiler: msvc-14-seh
       generator: "Visual Studio 14 2015 Win64"
 
+    - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013"
+
+    - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013 Win64"
+
     - compiler: gcc-5.3.0-posix
       generator: "MinGW Makefiles"
       cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin'
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
 
 matrix:
   fast_finish: true
@@ -30,12 +39,6 @@
   - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%")
   - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%")
 
-# TODO Remove this. This is a hack to work around bogus warning messages
-# See http://goo.gl/euguBI for more information.
-before_build:
-  - del "C:\Program Files (x86)\MSBuild\14.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
-  - del "C:\Program Files (x86)\MSBuild\12.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
-
 build_script:
   - md _build -Force
   - cd _build
@@ -51,4 +54,3 @@
     name: logs
   - path: '_build/Testing/**/*.xml'
     name: test_results
-
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 9afde84..0b176ba 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -19,14 +19,21 @@
 
 include(CheckCXXCompilerFlag)
 
-function(add_cxx_compiler_flag FLAG)
+function(mangle_compiler_flag FLAG OUTPUT)
   string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
   string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
-  set(CMAKE_REQUIRED_FLAGS "${FLAG}")
-  check_cxx_compiler_flag("${FLAG}" ${SANITIZED_FLAG})
-  if(${SANITIZED_FLAG})
+  set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE)
+endfunction(mangle_compiler_flag)
+
+function(add_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
     set(VARIANT ${ARGV1})
     if(ARGV1)
       string(TOUPPER "_${VARIANT}" VARIANT)
@@ -35,3 +42,23 @@
   endif()
 endfunction()
 
+function(add_required_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
+    set(VARIANT ${ARGV1})
+    if(ARGV1)
+      string(TOUPPER "_${VARIANT}" VARIANT)
+    endif()
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE)
+  else()
+    message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler")
+  endif()
+endfunction()
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index b106f32..2c4460f 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -10,7 +10,7 @@
 #
 # include(CXXFeatureCheck)
 # cxx_feature_check(STD_REGEX)
-# Requires CMake 2.6+
+# Requires CMake 2.8.12+
 
 if(__cxx_feature_check)
   return()
@@ -22,6 +22,8 @@
   string(TOUPPER ${FILE} VAR)
   string(TOUPPER "HAVE_${VAR}" FEATURE)
   if (DEFINED HAVE_${VAR})
+    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
+    add_definitions(-DHAVE_${VAR})
     return()
   endif()
   message("-- Performing Test ${FEATURE}")
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
new file mode 100644
index 0000000..6e9256e
--- /dev/null
+++ b/cmake/Config.cmake.in
@@ -0,0 +1 @@
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/docs/tools.md b/docs/tools.md
new file mode 100644
index 0000000..f176f74
--- /dev/null
+++ b/docs/tools.md
@@ -0,0 +1,59 @@
+# Benchmark Tools
+
+## compare_bench.py
+
+The `compare_bench.py` utility can be used to compare the results of two benchmark runs.
+The program is invoked like:
+
+``` bash
+$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
+```
+
+Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+The sample output using the JSON test files under `Inputs/` gives:
+
+``` bash
+$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
+Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
+Benchmark                   Time           CPU
+----------------------------------------------
+BM_SameTimes               +0.00         +0.00
+BM_2xFaster                -0.50         -0.50
+BM_2xSlower                +1.00         +1.00
+BM_10PercentFaster         -0.10         -0.10
+BM_10PercentSlower         +0.10         +0.10
+```
+
+When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
+
+```
+./compare_bench.py  test/basic_test test/basic_test  --benchmark_filter=BM_empty.*
+RUNNING: test/basic_test --benchmark_filter=BM_empty.*
+Run on (4 X 4228.32 MHz CPU s)
+2016-08-02 19:21:33
+Benchmark                              Time           CPU Iterations
+--------------------------------------------------------------------
+BM_empty                               9 ns          9 ns   79545455
+BM_empty/threads:4                     4 ns          9 ns   75268816
+BM_empty_stop_start                    8 ns          8 ns   83333333
+BM_empty_stop_start/threads:4          3 ns          8 ns   83333332
+RUNNING: test/basic_test --benchmark_filter=BM_empty.*
+Run on (4 X 4228.32 MHz CPU s)
+2016-08-02 19:21:35
+Benchmark                              Time           CPU Iterations
+--------------------------------------------------------------------
+BM_empty                               9 ns          9 ns   76086957
+BM_empty/threads:4                     4 ns          9 ns   76086956
+BM_empty_stop_start                    8 ns          8 ns   87500000
+BM_empty_stop_start/threads:4          3 ns          8 ns   88607596
+Comparing test/basic_test to test/basic_test
+Benchmark                              Time           CPU
+---------------------------------------------------------
+BM_empty                              +0.00         +0.00
+BM_empty/threads:4                    +0.00         +0.00
+BM_empty_stop_start                   +0.00         +0.00
+BM_empty_stop_start/threads:4         +0.00         +0.00
+```
+
+Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks.
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index 28baa58..7f9a58f 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -155,19 +155,29 @@
 
 #include <string>
 #include <vector>
+#include <map>
 
 #include "macros.h"
 
 #if defined(BENCHMARK_HAS_CXX11)
 #include <type_traits>
+#include <initializer_list>
 #include <utility>
 #endif
 
+#if defined(_MSC_VER)
+#include <intrin.h> // for _ReadWriteBarrier
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
 
 void Initialize(int* argc, char** argv);
 
+// Report to stderr all arguments in 'argv' as unrecognized except the first.
+// Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
+bool ReportUnrecognizedArguments(int argc, char** argv);
+
 // Generate a list of benchmarks matching the specified --benchmark_filter flag
 // and if --benchmark_list_tests is specified return after printing the name
 // of each matching benchmark. Otherwise run each matching benchmark and
@@ -197,19 +207,6 @@
 class BenchmarkImp;
 class BenchmarkFamilies;
 
-template <class T>
-struct Voider {
-  typedef void type;
-};
-
-template <class T, class = void>
-struct EnableIfString {};
-
-template <class T>
-struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
-  typedef int type;
-};
-
 void UseCharPointer(char const volatile*);
 
 // Take ownership of the pointer and register the benchmark. Return the
@@ -222,28 +219,82 @@
 
 }  // end namespace internal
 
+
+#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTEN)
+# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#endif
+
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
 // See: https://youtu.be/nXaxk27zwlk?t=2441
-#if defined(__GNUC__)
+#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  // Clang doesn't like the 'X' constraint on `value` and certain GCC versions
+  // don't like the 'g' constraint. Attempt to placate them both.
+#if defined(__clang__)
   asm volatile("" : : "g"(value) : "memory");
+#else
+  asm volatile("" : : "i,r,m"(value) : "memory");
+#endif
 }
 // Force the compiler to flush pending writes to global memory. Acts as an
 // effective read/write barrier
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   asm volatile("" : : : "memory");
 }
+#elif defined(_MSC_VER)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
+}
+
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  _ReadWriteBarrier();
+}
 #else
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
-// FIXME Add ClobberMemory() for non-gnu compilers
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
 #endif
 
+
+
+// This class is used for user-defined counters.
+class Counter {
+public:
+
+  enum Flags {
+    kDefaults   = 0,
+    // Mark the counter as a rate. It will be presented divided
+    // by the duration of the benchmark.
+    kIsRate     = 1,
+    // Mark the counter as a thread-average quantity. It will be
+    // presented divided by the number of threads.
+    kAvgThreads = 2,
+    // Mark the counter as a thread-average rate. See above.
+    kAvgThreadsRate = kIsRate|kAvgThreads
+  };
+
+  double value;
+  Flags  flags;
+
+  BENCHMARK_ALWAYS_INLINE
+  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
+
+  BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double      & ()       { return value; }
+
+};
+
+// This is the container for the user-defined counters.
+typedef std::map<std::string, Counter> UserCounters;
+
+
 // TimeUnit is passed to a benchmark in order to specify the order of magnitude
 // for the measured time.
 enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
@@ -393,13 +444,7 @@
   // REQUIRES: a benchmark has exited its KeepRunning loop.
   void SetLabel(const char* label);
 
-  // Allow the use of std::string without actually including <string>.
-  // This function does not participate in overload resolution unless StringType
-  // has the nested typename `basic_string`. This typename should be provided
-  // as an injected class name in the case of std::string.
-  template <class StringType>
-  void SetLabel(StringType const& str,
-                typename internal::EnableIfString<StringType>::type = 1) {
+  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
     this->SetLabel(str.c_str());
   }
 
@@ -434,6 +479,8 @@
   bool error_occurred_;
 
  public:
+  // Container for user-defined counters.
+  UserCounters counters;
   // Index of the executing thread. Values from [0, threads).
   const int thread_index;
   // Number of threads concurrently executing the benchmark.
@@ -536,9 +583,17 @@
 
   // Set the minimum amount of time to use when running this benchmark. This
   // option overrides the `benchmark_min_time` flag.
-  // REQUIRES: `t > 0`
+  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
   Benchmark* MinTime(double t);
 
+  // Specify the number of iterations that should be run by this benchmark.
+  // REQUIRES: `n > 0` and `MinTime` has not been called on this benchmark.
+  //
+  // NOTE: This function should only be used when *exact* iteration control is
+  //   needed and never to control or limit how long a benchmark runs, where
+  //   `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  Benchmark* Iterations(size_t n);
+
   // Specify the amount of times to repeat this benchmark. This option overrides
   // the `benchmark_repetitions` flag.
   // REQUIRES: `n > 0`
@@ -627,6 +682,7 @@
   TimeUnit time_unit_;
   int range_multiplier_;
   double min_time_;
+  size_t iterations_;
   int repetitions_;
   bool use_real_time_;
   bool use_manual_time_;
@@ -651,6 +707,10 @@
 internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
 #endif
 
+// Remove all registered benchmarks. All pointers to previously registered
+// benchmarks are invalidated.
+void ClearRegisteredBenchmarks();
+
 namespace internal {
 // The class used to hold all Benchmarks created from static function.
 // (ie those created using the BENCHMARK(...) macros.
@@ -858,6 +918,7 @@
 #define BENCHMARK_MAIN()                   \
   int main(int argc, char** argv) {        \
     ::benchmark::Initialize(&argc, argv);  \
+    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
     ::benchmark::RunSpecifiedBenchmarks(); \
   }
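The `DoNotOptimize()`/`ClobberMemory()` changes above extend the existing GNU inline-assembly barriers with an MSVC implementation based on `_ReadWriteBarrier()`. A minimal usage sketch (the benchmark name is illustrative only):

```c++
#include <benchmark/benchmark.h>
#include <vector>

static void BM_VectorPushBack(benchmark::State& state) {
  while (state.KeepRunning()) {
    std::vector<int> v;
    v.reserve(1);
    // Keep the compiler from optimizing the vector's storage away.
    benchmark::DoNotOptimize(v.data());
    v.push_back(42);
    // Treat pending writes as observable so the push_back is not elided.
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_VectorPushBack);
```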
 
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
index 8c39e7f..8821ebf 100644
--- a/include/benchmark/reporter.h
+++ b/include/benchmark/reporter.h
@@ -19,6 +19,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <set>
 
 #include "benchmark_api.h"  // For forward declaration of BenchmarkReporter
 
@@ -54,7 +55,8 @@
           complexity_lambda(),
           complexity_n(0),
           report_big_o(false),
-          report_rms(false) {}
+          report_rms(false),
+          counters() {}
 
     std::string benchmark_name;
     std::string report_label;  // Empty if not set by benchmark.
@@ -93,6 +95,8 @@
     // Inform print function whether the current run is a complexity report
     bool report_big_o;
     bool report_rms;
+
+    UserCounters counters;
   };
 
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
@@ -153,20 +157,29 @@
 // Simple reporter that outputs benchmark data to the console. This is the
 // default reporter used by RunSpecifiedBenchmarks().
 class ConsoleReporter : public BenchmarkReporter {
- public:
-  enum OutputOptions { OO_None, OO_Color };
-  explicit ConsoleReporter(OutputOptions color_output = OO_Color)
-      : name_field_width_(0), color_output_(color_output == OO_Color) {}
+public:
+  enum OutputOptions {
+    OO_None = 0,
+    OO_Color = 1,
+    OO_Tabular = 2,
+    OO_ColorTabular = OO_Color|OO_Tabular,
+    OO_Defaults = OO_ColorTabular
+  };
+  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
+      : output_options_(opts_), name_field_width_(0),
+        prev_counters_(), printed_header_(false) {}
 
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
 
  protected:
   virtual void PrintRunData(const Run& report);
-  size_t name_field_width_;
+  virtual void PrintHeader(const Run& report);
 
- private:
-  bool color_output_;
+  OutputOptions output_options_;
+  size_t name_field_width_;
+  UserCounters prev_counters_;
+  bool printed_header_;
 };
 
 class JSONReporter : public BenchmarkReporter {
@@ -184,11 +197,15 @@
 
 class CSVReporter : public BenchmarkReporter {
  public:
+  CSVReporter() : printed_header_(false) {}
   virtual bool ReportContext(const Context& context);
   virtual void ReportRuns(const std::vector<Run>& reports);
 
  private:
   void PrintRunData(const Run& report);
+
+  bool printed_header_;
+  std::set< std::string > user_counter_names_;
 };
 
 inline const char* GetTimeUnitString(TimeUnit unit) {
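The `ConsoleReporter` options are now a bit mask, so color output and the new tabular counter layout can be combined. Programmatic selection might look like the sketch below (it assumes the `RunSpecifiedBenchmarks(BenchmarkReporter*)` overload declared in `benchmark_api.h`); passing `--benchmark_counters_tabular=true` on the command line has the same effect:

```c++
#include <benchmark/benchmark.h>

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  // OO_ColorTabular == OO_Color | OO_Tabular.
  benchmark::ConsoleReporter reporter(
      benchmark::ConsoleReporter::OO_ColorTabular);
  benchmark::RunSpecifiedBenchmarks(&reporter);
}
```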
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4038875..244484b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,27 +18,61 @@
   VERSION ${GENERIC_LIB_VERSION}
   SOVERSION ${GENERIC_LIB_SOVERSION}
 )
+target_include_directories(benchmark PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+    )
 
 # Link threads.
 target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+find_library(LIBRT rt)
+if(LIBRT)
+  target_link_libraries(benchmark ${LIBRT})
+endif()
 
 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
   target_link_libraries(benchmark Shlwapi)
 endif()
 
-# Expose public API
-target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
+set(include_install_dir "include")
+set(lib_install_dir "lib/")
+set(bin_install_dir "bin/")
+set(config_install_dir "lib/cmake/${PROJECT_NAME}")
+
+set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+
+set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(targets_export_name "${PROJECT_NAME}Targets")
+
+set(namespace "${PROJECT_NAME}::")
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+    "${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion
+)
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
 
 # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
 install(
   TARGETS benchmark
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-  RUNTIME DESTINATION bin
-  COMPONENT library)
+  EXPORT ${targets_export_name}
+  ARCHIVE DESTINATION ${lib_install_dir}
+  LIBRARY DESTINATION ${lib_install_dir}
+  RUNTIME DESTINATION ${bin_install_dir}
+  INCLUDES DESTINATION ${include_install_dir})
 
 install(
   DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION include
+  DESTINATION ${include_install_dir}
   FILES_MATCHING PATTERN "*.*h")
+
+install(
+    FILES "${project_config}" "${version_config}"
+    DESTINATION "${config_install_dir}")
+
+install(
+    EXPORT "${targets_export_name}"
+    NAMESPACE "${namespace}"
+    DESTINATION "${config_install_dir}")
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 95f6a25..54b5e67 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -37,6 +37,7 @@
 #include "colorprint.h"
 #include "commandlineflags.h"
 #include "complexity.h"
+#include "counter.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
@@ -90,6 +91,11 @@
               "environment variable is set to a terminal type that supports "
               "colors.");
 
+DEFINE_bool(benchmark_counters_tabular, false,
+            "Whether to use tabular format when printing user counters to "
+            "the console.  Valid values: 'true'/'yes'/1, 'false'/'no'/0."
+            "Defaults to false.");
+
 DEFINE_int32(v, 0, "The level of verbose logging to output");
 
 namespace benchmark {
@@ -145,6 +151,7 @@
     std::string report_label_;
     std::string error_message_;
     bool has_error_ = false;
+    UserCounters counters;
   };
   GUARDED_BY(GetBenchmarkMutex()) Result results;
 
@@ -249,6 +256,8 @@
     report.complexity_n = results.complexity_n;
     report.complexity = b.complexity;
     report.complexity_lambda = b.complexity_lambda;
+    report.counters = results.counters;
+    internal::Finish(&report.counters, seconds, b.threads);
   }
   return report;
 }
@@ -272,6 +281,7 @@
     results.bytes_processed += st.bytes_processed();
     results.items_processed += st.items_processed();
     results.complexity_n += st.complexity_length_n();
+    internal::Increment(&results.counters, st.counters);
   }
   manager->NotifyThreadComplete();
 }
@@ -281,7 +291,8 @@
     std::vector<BenchmarkReporter::Run>* complexity_reports) {
   std::vector<BenchmarkReporter::Run> reports;  // return value
 
-  size_t iters = 1;
+  const bool has_explicit_iteration_count = b.iterations != 0;
+  size_t iters = has_explicit_iteration_count ? b.iterations : 1;
   std::unique_ptr<internal::ThreadManager> manager;
   std::vector<std::thread> pool(b.threads - 1);
   const int repeats =
@@ -291,7 +302,7 @@
       (b.report_mode == internal::RM_Unspecified
            ? FLAGS_benchmark_report_aggregates_only
            : b.report_mode == internal::RM_ReportAggregatesOnly);
-  for (int i = 0; i < repeats; i++) {
+  for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
     for (;;) {
       // Try benchmark
       VLOG(2) << "Running " << b.name << " for " << iters << "\n";
@@ -327,10 +338,20 @@
 
       const double min_time =
           !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
-      // If this was the first run, was elapsed time or cpu time large enough?
-      // If this is not the first run, go with the current value of iter.
-      if ((i > 0) || results.has_error_ || (iters >= kMaxIterations) ||
-          (seconds >= min_time) || (results.real_time_used >= 5 * min_time)) {
+
+      // Determine if this run should be reported; Either it has
+      // run for a sufficient amount of time or because an error was reported.
+      const bool should_report =  repetition_num > 0
+        || has_explicit_iteration_count // An exact iteration count was requested
+        || results.has_error_
+        || iters >= kMaxIterations
+        || seconds >= min_time // the elapsed time is large enough
+        // CPU time is specified but the elapsed real time greatly exceeds the
+        // minimum time. Note that user provided timers are except from this
+        // sanity check.
+        || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+
+      if (should_report) {
         BenchmarkReporter::Run report =
             CreateRunReport(b, results, iters, seconds);
         if (!report.error_occurred && b.complexity != oNone)
@@ -386,6 +407,7 @@
       items_processed_(0),
       complexity_n_(0),
       error_occurred_(false),
+      counters(),
       thread_index(thread_i),
       threads(n_threads),
       max_iterations(max_iters),
@@ -505,10 +527,10 @@
 }
 
 std::unique_ptr<BenchmarkReporter> CreateReporter(
-    std::string const& name, ConsoleReporter::OutputOptions allow_color) {
+    std::string const& name, ConsoleReporter::OutputOptions output_opts) {
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
   if (name == "console") {
-    return PtrType(new ConsoleReporter(allow_color));
+    return PtrType(new ConsoleReporter(output_opts));
   } else if (name == "json") {
     return PtrType(new JSONReporter);
   } else if (name == "csv") {
@@ -520,6 +542,30 @@
 }
 
 }  // end namespace
+
+bool IsZero(double n) {
+  return std::abs(n) < std::numeric_limits<double>::epsilon();
+}
+
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
+  int output_opts = ConsoleReporter::OO_Defaults;
+  if ((FLAGS_benchmark_color == "auto" && IsColorTerminal()) ||
+      IsTruthyFlagValue(FLAGS_benchmark_color)) {
+    output_opts |= ConsoleReporter::OO_Color;
+  } else {
+    output_opts &= ~ConsoleReporter::OO_Color;
+  }
+  if(force_no_color) {
+    output_opts &= ~ConsoleReporter::OO_Color;
+  }
+  if(FLAGS_benchmark_counters_tabular) {
+    output_opts |= ConsoleReporter::OO_Tabular;
+  } else {
+    output_opts &= ~ConsoleReporter::OO_Tabular;
+  }
+  return static_cast< ConsoleReporter::OutputOptions >(output_opts);
+}
+
 }  // end namespace internal
 
 size_t RunSpecifiedBenchmarks() {
@@ -541,16 +587,8 @@
   std::unique_ptr<BenchmarkReporter> default_console_reporter;
   std::unique_ptr<BenchmarkReporter> default_file_reporter;
   if (!console_reporter) {
-    auto output_opts = ConsoleReporter::OO_None;
-    if (FLAGS_benchmark_color == "auto")
-      output_opts = IsColorTerminal() ? ConsoleReporter::OO_Color
-                                      : ConsoleReporter::OO_None;
-    else
-      output_opts = IsTruthyFlagValue(FLAGS_benchmark_color)
-                        ? ConsoleReporter::OO_Color
-                        : ConsoleReporter::OO_None;
-    default_console_reporter =
-        internal::CreateReporter(FLAGS_benchmark_format, output_opts);
+    default_console_reporter = internal::CreateReporter(
+          FLAGS_benchmark_format, internal::GetOutputOptions());
     console_reporter = default_console_reporter.get();
   }
   auto& Out = console_reporter->GetOutputStream();
@@ -609,6 +647,7 @@
           "          [--benchmark_out=<filename>]\n"
           "          [--benchmark_out_format=<json|console|csv>]\n"
           "          [--benchmark_color={auto|true|false}]\n"
+          "          [--benchmark_counters_tabular={true|false}]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
 }
@@ -633,8 +672,10 @@
         // "color_print" is the deprecated name for "benchmark_color".
         // TODO: Remove this.
         ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
+        ParseBoolFlag(argv[i], "benchmark_counters_tabular",
+                        &FLAGS_benchmark_counters_tabular) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
+      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
 
       --(*argc);
       --i;
@@ -664,4 +705,11 @@
   internal::LogLevel() = FLAGS_v;
 }
 
+bool ReportUnrecognizedArguments(int argc, char** argv) {
+  for (int i = 1; i < argc; ++i) {
+    fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], argv[i]);
+  }
+  return argc > 1;
+}
+
 }  // end namespace benchmark
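With `ReportUnrecognizedArguments()` in place, a hand-written `main()` can mirror what `BENCHMARK_MAIN()` now does: any flag left over after `Initialize()` (beyond `argv[0]`) is printed to stderr and turned into a non-zero exit. A sketch:

```c++
#include <benchmark/benchmark.h>

int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);  // consumes recognized --benchmark_* flags
  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
  ::benchmark::RunSpecifiedBenchmarks();
  return 0;
}
```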
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 8b97ce6..c220873 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -24,9 +24,11 @@
   bool use_manual_time;
   BigO complexity;
   BigOFunc* complexity_lambda;
+  UserCounters counters;
   bool last_benchmark_instance;
   int repetitions;
   double min_time;
+  size_t iterations;
   int threads;  // Number of concurrent threads to us
 };
 
@@ -34,13 +36,10 @@
                             std::vector<Benchmark::Instance>* benchmarks,
                             std::ostream* Err);
 
-namespace {
+bool IsZero(double n);
 
-bool IsZero(double n) {
-  return std::abs(n) < std::numeric_limits<double>::epsilon();
-}
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
 
-}  // end namespace
 }  // end namespace internal
 }  // end namespace benchmark
 
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 4e580d8..ed70d82 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -31,6 +31,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <sstream>
 #include <thread>
 
 #include "check.h"
@@ -69,6 +70,9 @@
   // Registers a benchmark family and returns the index assigned to it.
   size_t AddBenchmark(std::unique_ptr<Benchmark> family);
 
+  // Clear all registered benchmark families.
+  void ClearBenchmarks();
+
   // Extract the list of benchmark instances that match the specified
   // regular expression.
   bool FindBenchmarks(const std::string& re,
@@ -94,6 +98,12 @@
   return index;
 }
 
+void BenchmarkFamilies::ClearBenchmarks() {
+  MutexLock l(mutex_);
+  families_.clear();
+  families_.shrink_to_fit();
+}
+
 bool BenchmarkFamilies::FindBenchmarks(
     const std::string& spec, std::vector<Benchmark::Instance>* benchmarks,
     std::ostream* ErrStream) {
@@ -143,6 +153,7 @@
         instance.time_unit = family->time_unit_;
         instance.range_multiplier = family->range_multiplier_;
         instance.min_time = family->min_time_;
+        instance.iterations = family->iterations_;
         instance.repetitions = family->repetitions_;
         instance.use_real_time = family->use_real_time_;
         instance.use_manual_time = family->use_manual_time_;
@@ -162,17 +173,18 @@
                   StringPrintF("%s:", family->arg_names_[arg_i].c_str());
             }
           }
-
-          AppendHumanReadable(arg, &instance.name);
+          
+          instance.name += StringPrintF("%d", arg);
           ++arg_i;
         }
 
-        if (!IsZero(family->min_time_)) {
+        if (!IsZero(family->min_time_))
           instance.name += StringPrintF("/min_time:%0.3f", family->min_time_);
-        }
-        if (family->repetitions_ != 0) {
+        if (family->iterations_ != 0)
+          instance.name += StringPrintF("/iterations:%d", family->iterations_);
+        if (family->repetitions_ != 0)
           instance.name += StringPrintF("/repeats:%d", family->repetitions_);
-        }
+
         if (family->use_manual_time_) {
           instance.name += "/manual_time";
         } else if (family->use_real_time_) {
@@ -219,6 +231,7 @@
       time_unit_(kNanosecond),
       range_multiplier_(kRangeMultiplier),
       min_time_(0),
+      iterations_(0),
       repetitions_(0),
       use_real_time_(false),
       use_manual_time_(false),
@@ -344,6 +357,22 @@
   return this;
 }
 
+
+Benchmark* Benchmark::MinTime(double t) {
+  CHECK(t > 0.0);
+  CHECK(iterations_ == 0);
+  min_time_ = t;
+  return this;
+}
+
+
+Benchmark* Benchmark::Iterations(size_t n) {
+  CHECK(n > 0);
+  CHECK(IsZero(min_time_));
+  iterations_ = n;
+  return this;
+}
+
 Benchmark* Benchmark::Repetitions(int n) {
   CHECK(n > 0);
   repetitions_ = n;
@@ -355,12 +384,6 @@
   return this;
 }
 
-Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  min_time_ = t;
-  return this;
-}
-
 Benchmark* Benchmark::UseRealTime() {
   CHECK(!use_manual_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
@@ -436,4 +459,9 @@
 void FunctionBenchmark::Run(State& st) { func_(st); }
 
 }  // end namespace internal
+
+void ClearRegisteredBenchmarks() {
+  internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
+}
+
 }  // end namespace benchmark
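The new `Iterations()` setter gives exact iteration control and is mutually exclusive with `MinTime()` (both enforced by the `CHECK`s above). Registration might look like this sketch (`BM_Parse` is a made-up name):

```c++
static void BM_Parse(benchmark::State& state) {
  while (state.KeepRunning()) {
    // ... code under test
  }
}
// Run exactly 1000 iterations; use only when exact control is required.
BENCHMARK(BM_Parse)->Iterations(1000);
// Alternative, time-based stopping (cannot be combined with Iterations()):
// BENCHMARK(BM_Parse)->MinTime(0.5);
```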
diff --git a/src/check.h b/src/check.h
index 6f1fe0c..73bead2 100644
--- a/src/check.h
+++ b/src/check.h
@@ -3,6 +3,7 @@
 
 #include <cstdlib>
 #include <ostream>
+#include <cmath>
 
 #include "internal_macros.h"
 #include "log.h"
@@ -68,4 +69,11 @@
 #define CHECK_GT(a, b) CHECK((a) > (b))
 #define CHECK_LT(a, b) CHECK((a) < (b))
 
+#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
+#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
+#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
+#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
+#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
+#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+
 #endif  // CHECK_H_
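For reference, the new epsilon-based checks compare within a caller-supplied tolerance; a small internal-use sketch (this header is library-internal, not part of the public API):

```c++
#include "check.h"  // library-internal header

void FloatCheckExamples() {
  const double eps = 1e-9;
  CHECK_FLOAT_EQ(0.1 + 0.2, 0.3, eps);  // |a - b| < eps
  CHECK_FLOAT_GE(1.0, 1.0, eps);        // a - b > -eps, i.e. a >= b within eps
}
```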
diff --git a/src/colorprint.cc b/src/colorprint.cc
index 513376b..2dec4a8 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -89,7 +89,7 @@
 
   std::size_t size = 256;
   char local_buff[256];
-  auto ret = std::vsnprintf(local_buff, size, msg, args_cp);
+  auto ret = vsnprintf(local_buff, size, msg, args_cp);
 
   va_end(args_cp);
 
@@ -104,7 +104,7 @@
     // we did not provide a long enough buffer on our first attempt.
     size = (size_t)ret + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
-    ret = std::vsnprintf(buff.get(), size, msg, args);
+    ret = vsnprintf(buff.get(), size, msg, args);
     CHECK(ret > 0 && ((size_t)ret) < size);
     return buff.get();
   }
diff --git a/src/complexity.cc b/src/complexity.cc
index dfab791..24b3ed4 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -35,9 +35,9 @@
     case oNCubed:
       return [](int n) -> double { return std::pow(n, 3); };
     case oLogN:
-      return [](int n) { return std::log2(n); };
+      return [](int n) { return log2(n); };
     case oNLogN:
-      return [](int n) { return n * std::log2(n); };
+      return [](int n) { return n * log2(n); };
     case o1:
     default:
       return [](int) { return 1.0; };
@@ -171,6 +171,22 @@
   // All repetitions should be run with the same number of iterations so we
   // can take this information from the first benchmark.
   int64_t const run_iterations = reports.front().iterations;
+  // create stats for user counters
+  struct CounterStat {
+    Counter c;
+    Stat1_d s;
+  };
+  std::map< std::string, CounterStat > counter_stats;
+  for(Run const& r : reports) {
+    for(auto const& cnt : r.counters) {
+      auto it = counter_stats.find(cnt.first);
+      if(it == counter_stats.end()) {
+        counter_stats.insert({cnt.first, {cnt.second, Stat1_d{}}});
+      } else {
+        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+      }
+    }
+  }
 
   // Populate the accumulators.
   for (Run const& run : reports) {
@@ -178,11 +194,17 @@
     CHECK_EQ(run_iterations, run.iterations);
     if (run.error_occurred) continue;
     real_accumulated_time_stat +=
-        Stat1_d(run.real_accumulated_time / run.iterations, run.iterations);
+        Stat1_d(run.real_accumulated_time / run.iterations);
     cpu_accumulated_time_stat +=
-        Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
-    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
-    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
+        Stat1_d(run.cpu_accumulated_time / run.iterations);
+    items_per_second_stat += Stat1_d(run.items_per_second);
+    bytes_per_second_stat += Stat1_d(run.bytes_per_second);
+    // user counters
+    for(auto const& cnt : run.counters) {
+      auto it = counter_stats.find(cnt.first);
+      CHECK_NE(it, counter_stats.end());
+      it->second.s += Stat1_d(cnt.second);
+    }
   }
 
   // Get the data from the accumulator to BenchmarkReporter::Run's.
@@ -196,6 +218,11 @@
   mean_data.bytes_per_second = bytes_per_second_stat.Mean();
   mean_data.items_per_second = items_per_second_stat.Mean();
   mean_data.time_unit = reports[0].time_unit;
+  // user counters
+  for(auto const& kv : counter_stats) {
+    auto c = Counter(kv.second.s.Mean(), counter_stats[kv.first].c.flags);
+    mean_data.counters[kv.first] = c;
+  }
 
   // Only add label to mean/stddev if it is same for all runs
   mean_data.report_label = reports[0].report_label;
@@ -215,6 +242,11 @@
   stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
   stddev_data.items_per_second = items_per_second_stat.StdDev();
   stddev_data.time_unit = reports[0].time_unit;
+  // user counters
+  for(auto const& kv : counter_stats) {
+    auto c = Counter(kv.second.s.StdDev(), counter_stats[kv.first].c.flags);
+    stddev_data.counters[kv.first] = c;
+  }
 
   results.push_back(mean_data);
   results.push_back(stddev_data);
@@ -263,6 +295,11 @@
   big_o.report_big_o = true;
   big_o.complexity = result_cpu.complexity;
 
+  // All the time results are reported after being multiplied by the
+  // time unit multiplier. But since RMS is a relative quantity it
+  // should not be multiplied at all. So, here, we _divide_ it by the
+  // multiplier so that when it is multiplied later the result is the
+  // correct one.
   double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
 
   // Only add label to mean/stddev if it is same for all runs
@@ -275,6 +312,9 @@
   rms.cpu_accumulated_time = result_cpu.rms / multiplier;
   rms.report_rms = true;
   rms.complexity = result_cpu.complexity;
+  // don't forget to keep the time unit, or we won't be able to
+  // recover the correct value.
+  rms.time_unit = reports[0].time_unit;
 
   results.push_back(big_o);
   results.push_back(rms);
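
To make the divide-then-multiply round trip above concrete: with an RMS of 5% and a time-unit multiplier m (whatever GetTimeUnitMultiplier returns for the run's unit; the 1000 below is only an assumed value), the stored and printed values work out as:

double rms = 0.05;          // 5%, a unitless ratio
double m = 1000.0;          // assumed GetTimeUnitMultiplier(...) result
double stored = rms / m;    // what this function writes into the Run
double shown = stored * m;  // what the reporter prints after scaling: 0.05 again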
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 7e0cca3..7c3b7d8 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -14,6 +14,7 @@
 
 #include "benchmark/reporter.h"
 #include "complexity.h"
+#include "counter.h"
 
 #include <algorithm>
 #include <cstdint>
@@ -34,27 +35,59 @@
 
 bool ConsoleReporter::ReportContext(const Context& context) {
   name_field_width_ = context.name_field_width;
+  printed_header_ = false;
+  prev_counters_.clear();
 
   PrintBasicContext(&GetErrorStream(), context);
 
 #ifdef BENCHMARK_OS_WINDOWS
-  if (color_output_ && &std::cout != &GetOutputStream()) {
+  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
     GetErrorStream()
         << "Color printing is only supported for stdout on windows."
            " Disabling color printing\n";
-    color_output_ = false;
+    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
   }
 #endif
-  std::string str =
-      FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
-                   "Benchmark", "Time", "CPU", "Iterations");
-  GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
 
   return true;
 }
 
+void ConsoleReporter::PrintHeader(const Run& run) {
+  std::string str = FormatString("%-*s %13s %13s %10s", static_cast<int>(name_field_width_),
+                                 "Benchmark", "Time", "CPU", "Iterations");
+  if(!run.counters.empty()) {
+    if(output_options_ & OO_Tabular) {
+      for(auto const& c : run.counters) {
+        str += FormatString(" %10s", c.first.c_str());
+      }
+    } else {
+      str += " UserCounters...";
+    }
+  }
+  str += "\n";
+  std::string line = std::string(str.length(), '-');
+  GetOutputStream() << line << "\n" << str << line << "\n";
+}
+
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
-  for (const auto& run : reports) PrintRunData(run);
+  for (const auto& run : reports) {
+    // print the header:
+    // --- if none was printed yet
+    bool print_header = !printed_header_;
+    // --- or if the format is tabular and this run
+    //     has different fields from the prev header
+    print_header |= (output_options_ & OO_Tabular) &&
+                    (!internal::SameNames(run.counters, prev_counters_));
+    if (print_header) {
+      printed_header_ = true;
+      prev_counters_ = run.counters;
+      PrintHeader(run);
+    }
+    // As an alternative to printing the headers like this, we could sort
+    // the benchmarks by header and then print. But this would require
+    // waiting for the full results before printing, or printing twice.
+    PrintRunData(run);
+  }
 }
 
 static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
@@ -68,8 +101,8 @@
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-  PrinterFn* printer =
-      color_output_ ? (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+  PrinterFn* printer = (output_options_ & OO_Color) ?
+                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
   printer(Out, name_color, "%-*s ", name_field_width_,
@@ -114,6 +147,21 @@
     printer(Out, COLOR_CYAN, "%10lld", result.iterations);
   }
 
+  for (auto& c : result.counters) {
+    auto const& s = HumanReadableNumber(c.second.value);
+    if (output_options_ & OO_Tabular) {
+      if (c.second.flags & Counter::kIsRate) {
+        printer(Out, COLOR_DEFAULT, " %8s/s", s.c_str());
+      } else {
+        printer(Out, COLOR_DEFAULT, " %10s", s.c_str());
+      }
+    } else {
+      const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+              unit);
+    }
+  }
+
   if (!rate.empty()) {
     printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
   }
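
Roughly, with counters "foo" and "bar" (bar flagged kIsRate), a result line in the default mode comes out as something like

  BM_Something        105 ns        103 ns    6788276 bar=2.9M/s foo=1.47k

while --benchmark_counters_tabular=true gives each counter its own column under a header such as

  Benchmark            Time           CPU Iterations        bar        foo

with the header re-printed whenever a run's counter set differs from the previous one. All numbers and spacing above are made up for illustration.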
diff --git a/src/counter.cc b/src/counter.cc
new file mode 100644
index 0000000..ed1aa04
--- /dev/null
+++ b/src/counter.cc
@@ -0,0 +1,68 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "counter.h"
+
+namespace benchmark {
+namespace internal {
+
+double Finish(Counter const& c, double cpu_time, double num_threads) {
+  double v = c.value;
+  if (c.flags & Counter::kIsRate) {
+    v /= cpu_time;
+  }
+  if (c.flags & Counter::kAvgThreads) {
+    v /= num_threads;
+  }
+  return v;
+}
+
+void Finish(UserCounters *l, double cpu_time, double num_threads) {
+  for (auto &c : *l) {
+    c.second.value = Finish(c.second, cpu_time, num_threads);
+  }
+}
+
+void Increment(UserCounters *l, UserCounters const& r) {
+  // add counters present in both or just in *l
+  for (auto &c : *l) {
+    auto it = r.find(c.first);
+    if (it != r.end()) {
+      c.second.value = c.second + it->second;
+    }
+  }
+  // add counters present in r, but not in *l
+  for (auto const &tc : r) {
+    auto it = l->find(tc.first);
+    if (it == l->end()) {
+      (*l)[tc.first] = tc.second;
+    }
+  }
+}
+
+bool SameNames(UserCounters const& l, UserCounters const& r) {
+  if (&l == &r) return true;
+  if (l.size() != r.size()) {
+    return false;
+  }
+  for (auto const& c : l) {
+    if (r.find(c.first) == r.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+} // end namespace internal
+} // end namespace benchmark
diff --git a/src/counter.h b/src/counter.h
new file mode 100644
index 0000000..bbb92d9
--- /dev/null
+++ b/src/counter.h
@@ -0,0 +1,26 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark_api.h"
+
+namespace benchmark {
+
+// these counter-related functions are hidden to reduce API surface.
+namespace internal {
+void Finish(UserCounters *l, double time, double num_threads);
+void Increment(UserCounters *l, UserCounters const& r);
+bool SameNames(UserCounters const& l, UserCounters const& r);
+} // end namespace internal
+
+} // end namespace benchmark
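
For reference, a minimal sketch of the user-facing counters API that these helpers support; it mirrors the tests added below, and the counter names and values are illustrative:

#include "benchmark/benchmark.h"

static void BM_WithCounters(benchmark::State& state) {
  size_t items = 0;
  while (state.KeepRunning()) {
    ++items;
  }
  // Plain counter: reported as-is once the run finishes.
  state.counters["Items"] = static_cast<double>(items);
  // Rate counter: divided by the measured CPU time in Finish().
  state.counters["ItemRate"] =
      benchmark::Counter(static_cast<double>(items), benchmark::Counter::kIsRate);
}
BENCHMARK(BM_WithCounters);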
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index 18ab3b6..b8072fd 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -24,6 +24,7 @@
 
 #include "string_util.h"
 #include "timers.h"
+#include "check.h"
 
 // File format reference: http://edoceo.com/utilitas/csv-file-format.
 
@@ -38,21 +39,51 @@
 
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
-
-  std::ostream& Out = GetOutputStream();
-  for (auto B = elements.begin(); B != elements.end();) {
-    Out << *B++;
-    if (B != elements.end()) Out << ",";
-  }
-  Out << "\n";
   return true;
 }
 
-void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
-  for (const auto& run : reports) PrintRunData(run);
+void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
+  std::ostream& Out = GetOutputStream();
+
+  if (!printed_header_) {
+    // save the names of all the user counters
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        user_counter_names_.insert(cnt.first);
+      }
+    }
+
+    // print the header
+    for (auto B = elements.begin(); B != elements.end();) {
+      Out << *B++;
+      if (B != elements.end()) Out << ",";
+    }
+    for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) {
+      Out << ",\"" << *B++ << "\"";
+    }
+    Out << "\n";
+
+    printed_header_ = true;
+  } else {
+    // check that all the current counters are saved in the name set
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+              << "All counters must be present in each run. "
+              << "Counter named \"" << cnt.first
+              << "\" was not in a run after being added to the header";
+      }
+    }
+  }
+
+  // print results for each run
+  for (const auto& run : reports) {
+    PrintRunData(run);
+  }
+
 }
 
-void CSVReporter::PrintRunData(const Run& run) {
+void CSVReporter::PrintRunData(const Run & run) {
   std::ostream& Out = GetOutputStream();
 
   // Field with embedded double-quote characters must be doubled and the field
@@ -102,6 +133,16 @@
     Out << "\"" << label << "\"";
   }
   Out << ",,";  // for error_occurred and error_message
+
+  // Print user counters
+  for (const auto &ucn : user_counter_names_) {
+    auto it = run.counters.find(ucn);
+    if(it == run.counters.end()) {
+      Out << ",";
+    } else {
+      Out << "," << it->second;
+    }
+  }
   Out << '\n';
 }
 
diff --git a/src/cycleclock.h b/src/cycleclock.h
index e4825d4..e0f9b01 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -43,6 +43,11 @@
 
 #ifndef BENCHMARK_OS_WINDOWS
 #include <sys/time.h>
+#include <time.h>
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
 #endif
 
 namespace benchmark {
@@ -65,6 +70,10 @@
   // counter pauses; it does not continue counting, nor does it
   // reset to zero.
   return mach_absolute_time();
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // this goes above x86-specific code because old versions of Emscripten
+  // define __x86_64__, although they have nothing to do with it.
+  return static_cast<int64_t>(emscripten_get_now() * 1e+6);
 #elif defined(__i386__)
   int64_t ret;
   __asm__ volatile("rdtsc" : "=A"(ret));
@@ -79,7 +88,7 @@
   asm("mftbu %0" : "=r"(tbu0));
   asm("mftb  %0" : "=r"(tbl));
   asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64>(tbu0 == tbu1);
+  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
   // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
   return (tbu1 << 32) | tbl;
 #elif defined(__sparc__)
@@ -99,6 +108,22 @@
   _asm rdtsc
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
+#elif defined(BENCHMARK_OS_NACL)
+  // Native Client validator on x86/x86-64 allows RDTSC instructions,
+  // and this case is handled above. Native Client validator on ARM
+  // rejects MRC instructions (used in the ARM-specific sequence below),
+  // so we handle it here. Portable Native Client compiles to
+  // architecture-agnostic bytecode, which doesn't provide any
+  // cycle counter access mnemonics.
+
+  // Native Client does not provide any API to access the cycle counter.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution (which is noticeable at
+  // least for PNaCl modules running on x86 Mac & Linux).
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = { 0, 0 };
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #elif defined(__aarch64__)
   // System timer of ARMv8 runs at a different frequency than the CPU's.
   // The frequency is fixed, typically in the range 1-50MHz.  It can be
@@ -108,7 +133,9 @@
   asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
   return virtual_timer_value;
 #elif defined(__ARM_ARCH)
-#if (__ARM_ARCH >= 6)  // V6 is the earliest arch that has a standard cyclecount
+  // V6 is the earliest arch that has a standard cyclecount
+  // Native Client validator doesn't allow MRC instructions.
+#if (__ARM_ARCH >= 6)
   uint32_t pmccntr;
   uint32_t pmuseren;
   uint32_t pmcntenset;
diff --git a/src/internal_macros.h b/src/internal_macros.h
index e8efcbb..ab9dd85 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -30,13 +30,26 @@
 #elif defined(_WIN32)
 #define BENCHMARK_OS_WINDOWS 1
 #elif defined(__APPLE__)
-// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
-// that it is an apple system.
-#define BENCHMARK_OS_MACOSX 1
+#include "TargetConditionals.h"
+  #if defined(TARGET_OS_MAC)
+    #define BENCHMARK_OS_MACOSX 1
+    #if defined(TARGET_OS_IPHONE)
+      #define BENCHMARK_OS_IOS 1
+    #endif
+  #endif
 #elif defined(__FreeBSD__)
 #define BENCHMARK_OS_FREEBSD 1
 #elif defined(__linux__)
 #define BENCHMARK_OS_LINUX 1
+#elif defined(__native_client__)
+#define BENCHMARK_OS_NACL 1
+#elif defined(EMSCRIPTEN)
+#define BENCHMARK_OS_EMSCRIPTEN 1
+#endif
+
+#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
+     && !defined(__EXCEPTIONS)
+#define BENCHMARK_HAS_NO_EXCEPTIONS
 #endif
 
 #endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index cea5f9b..5a65308 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -154,10 +154,15 @@
         << indent
         << FormatKV("items_per_second", RoundDouble(run.items_per_second));
   }
+  for(auto &c : run.counters) {
+    out << ",\n"
+        << indent
+        << FormatKV(c.first, RoundDouble(c.second));
+  }
   if (!run.report_label.empty()) {
     out << ",\n" << indent << FormatKV("label", run.report_label);
   }
   out << '\n';
 }
 
-}  // end namespace benchmark
+} // end namespace benchmark
diff --git a/src/re.h b/src/re.h
index af4a498..01e9736 100644
--- a/src/re.h
+++ b/src/re.h
@@ -15,6 +15,15 @@
 #ifndef BENCHMARK_RE_H_
 #define BENCHMARK_RE_H_
 
+#include "internal_macros.h"
+
+// Prefer C regex libraries when compiling w/o exceptions so that we can
+// correctly report errors.
+#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && defined(HAVE_STD_REGEX) && \
+    (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
+#undef HAVE_STD_REGEX
+#endif
+
 #if defined(HAVE_STD_REGEX)
 #include <regex>
 #elif defined(HAVE_GNU_POSIX_REGEX)
@@ -62,15 +71,20 @@
 #if defined(HAVE_STD_REGEX)
 
 inline bool Regex::Init(const std::string& spec, std::string* error) {
+#ifdef BENCHMARK_HAS_NO_EXCEPTIONS
+  ((void)error); // suppress unused warning
+#else
   try {
+#endif
     re_ = std::regex(spec, std::regex_constants::extended);
-
     init_ = true;
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
   } catch (const std::regex_error& e) {
     if (error) {
       *error = e.what();
     }
   }
+#endif
   return init_;
 }
 
diff --git a/src/sleep.cc b/src/sleep.cc
index 918abc4..54aa04a 100644
--- a/src/sleep.cc
+++ b/src/sleep.cc
@@ -15,6 +15,7 @@
 #include "sleep.h"
 
 #include <cerrno>
+#include <cstdlib>
 #include <ctime>
 
 #include "internal_macros.h"
@@ -40,7 +41,7 @@
 }
 
 void SleepForMilliseconds(int milliseconds) {
-  SleepForMicroseconds(static_cast<int>(milliseconds) * kNumMicrosPerMilli);
+  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
 }
 
 void SleepForSeconds(double seconds) {
diff --git a/src/sleep.h b/src/sleep.h
index f1e515c..f98551a 100644
--- a/src/sleep.h
+++ b/src/sleep.h
@@ -1,14 +1,12 @@
 #ifndef BENCHMARK_SLEEP_H_
 #define BENCHMARK_SLEEP_H_
 
-#include <cstdint>
-
 namespace benchmark {
-const int64_t kNumMillisPerSecond = 1000LL;
-const int64_t kNumMicrosPerMilli = 1000LL;
-const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
-const int64_t kNumNanosPerMicro = 1000LL;
-const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
+const int kNumMillisPerSecond = 1000;
+const int kNumMicrosPerMilli = 1000;
+const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
+const int kNumNanosPerMicro = 1000;
+const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
 
 void SleepForMilliseconds(int milliseconds);
 void SleepForSeconds(double seconds);
diff --git a/src/stat.h b/src/stat.h
index 136c3aa..d356875 100644
--- a/src/stat.h
+++ b/src/stat.h
@@ -119,18 +119,22 @@
     if (numsamples_ == 0) return VType();
     VType mean = sum_ * (1.0 / numsamples_);
     if (stddev) {
-      VType avg_squares = sum_squares_ * (1.0 / numsamples_);
-      *stddev = Sqrt(avg_squares - Sqr(mean));
+      // Sample standard deviation is undefined for n = 1
+      if (numsamples_ == 1) {
+        *stddev = VType();
+      } else {
+        VType avg_squares = sum_squares_ * (1.0 / numsamples_);
+        *stddev = Sqrt(numsamples_ / (numsamples_ - 1.0) * (avg_squares - Sqr(mean)));
+      }
     }
     return mean;
   }
 
   // Return the standard deviation of the sample set
   VType StdDev() const {
-    if (numsamples_ == 0) return VType();
-    VType mean = Mean();
-    VType avg_squares = sum_squares_ * (1.0 / numsamples_);
-    return Sqrt(avg_squares - Sqr(mean));
+    VType stddev = VType();
+    Mean(&stddev);
+    return stddev;
   }
 
  private:
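
A quick worked check of the Bessel-corrected formula above, for the three samples {1, 2, 3}:

//   n = 3, mean = 2, avg_squares = (1 + 4 + 9) / 3 = 14/3
//   old (population) result: sqrt(14/3 - 4)         = sqrt(2/3) ~= 0.816
//   new (sample)     result: sqrt(3/2 * (14/3 - 4)) = sqrt(1)    = 1
// which matches the textbook sample standard deviation of {1, 2, 3}. For a
// single sample the n/(n-1) factor is undefined, hence the explicit zero.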
diff --git a/src/string_util.cc b/src/string_util.cc
index 4cefbfb..cd4e7cf 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -45,6 +45,8 @@
       std::max(thresh, 1.0 / std::pow(10.0, precision));
   const double big_threshold = adjusted_threshold * one_k;
   const double small_threshold = adjusted_threshold;
+  // Values in (simple_threshold, small_threshold) will be printed as-is
+  const double simple_threshold = 0.01;
 
   if (val > big_threshold) {
     // Positive powers
@@ -62,14 +64,16 @@
     *exponent = 0;
   } else if (val < small_threshold) {
     // Negative powers
-    double scaled = val;
-    for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
-      scaled *= one_k;
-      if (scaled >= small_threshold) {
-        mantissa_stream << scaled;
-        *exponent = -static_cast<int64_t>(i + 1);
-        *mantissa = mantissa_stream.str();
-        return;
+    if (val < simple_threshold) {
+      double scaled = val;
+      for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
+        scaled *= one_k;
+        if (scaled >= small_threshold) {
+          mantissa_stream << scaled;
+          *exponent = -static_cast<int64_t>(i + 1);
+          *mantissa = mantissa_stream.str();
+          return;
+        }
       }
     }
     mantissa_stream << val;
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index dd1e663..7feb79e 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -75,7 +75,9 @@
     char line[1024];
     char* err;
     memset(line, '\0', sizeof(line));
-    CHECK(read(fd, line, sizeof(line) - 1));
+    ssize_t read_err = read(fd, line, sizeof(line) - 1);
+    ((void)read_err); // prevent unused warning
+    CHECK(read_err >= 0);
     const long temp_value = strtol(line, &err, 10);
     if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
       *value = temp_value;
@@ -295,8 +297,13 @@
       (size == sizeof(cpu_freq))) {
     cpuinfo_cycles_per_second = cpu_freq;
   } else {
+    #if defined BENCHMARK_OS_IOS
+    fprintf(stderr, "CPU frequency cannot be detected. \n");
+    cpuinfo_cycles_per_second = 0;
+    #else
     fprintf(stderr, "%s\n", strerror(errno));
     std::exit(EXIT_FAILURE);
+    #endif
   }
 #else
   // Generic cycles per second counter
diff --git a/src/timers.cc b/src/timers.cc
index fadc08f..8d56e8a 100644
--- a/src/timers.cc
+++ b/src/timers.cc
@@ -35,6 +35,10 @@
 #endif
 #endif
 
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
+#endif
+
 #include <cerrno>
 #include <cstdint>
 #include <cstdio>
@@ -100,14 +104,7 @@
 }  // end namespace
 
 double ProcessCPUUsage() {
-// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-// https://github.com/google/benchmark/pull/292
-#if defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  struct timespec spec;
-  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
-    return MakeTime(spec);
-  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
-#elif defined(BENCHMARK_OS_WINDOWS)
+#if defined(BENCHMARK_OS_WINDOWS)
   HANDLE proc = GetCurrentProcess();
   FILETIME creation_time;
   FILETIME exit_time;
@@ -117,21 +114,28 @@
                       &user_time))
     return MakeTime(kernel_time, user_time);
   DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
+  // Use Emscripten-specific API. Reported CPU time would be exactly the
+  // same as total time, but this is ok because there aren't long-latency
+  // synchronous system calls in Emscripten.
+  return emscripten_get_now() * 1e-3;
+#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11. See
+  // https://github.com/google/benchmark/pull/292
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
 #else
   struct rusage ru;
   if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
-  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+  DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
 #endif
 }
 
 double ThreadCPUUsage() {
-// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-// https://github.com/google/benchmark/pull/292
-#if defined(CLOCK_THREAD_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  struct timespec ts;
-  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
-  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
-#elif defined(BENCHMARK_OS_WINDOWS)
+#if defined(BENCHMARK_OS_WINDOWS)
   HANDLE this_thread = GetCurrentThread();
   FILETIME creation_time;
   FILETIME exit_time;
@@ -141,6 +145,8 @@
                  &user_time);
   return MakeTime(kernel_time, user_time);
 #elif defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11. See
+  // https://github.com/google/benchmark/pull/292
   mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
   thread_basic_info_data_t info;
   mach_port_t thread = pthread_mach_thread_np(pthread_self());
@@ -149,6 +155,13 @@
     return MakeTime(info);
   }
   DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // Emscripten doesn't support traditional threads
+  return ProcessCPUUsage();
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
 #else
 #error Per-thread timing is not available on your system.
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8724598..b55612b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,6 +1,26 @@
 # Enable the tests
 
 find_package(Threads REQUIRED)
+include(CheckCXXCompilerFlag)
+
+# NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
+# strip -DNDEBUG from the default CMake flags in DEBUG mode.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
+if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
+  add_definitions( -UNDEBUG )
+  add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
+  # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
+  foreach (flags_var_to_scrub
+      CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_CXX_FLAGS_MINSIZEREL
+      CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS_MINSIZEREL)
+    string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
+      "${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
+  endforeach()
+endif()
 
 # NOTE: These flags must be added after find_package(Threads REQUIRED) otherwise
 # they will break the configuration check.
@@ -8,7 +28,7 @@
   list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
 endif()
 
-add_library(output_test_helper STATIC output_test_helper.cc)
+add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
@@ -56,6 +76,11 @@
 add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(donotoptimize_test)
+# Some of the issues with DoNotOptimize only occur when optimization is enabled
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+if (BENCHMARK_HAS_O3_FLAG)
+  set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
+endif()
 add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(fixture_test)
@@ -73,6 +98,12 @@
 compile_output_test(reporter_output_test)
 add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
 
+compile_output_test(user_counters_test)
+add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
+
+compile_output_test(user_counters_tabular_test)
+add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
   set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index d832f81..7a16466 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -150,7 +150,7 @@
 BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
 static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range(0) / sizeof(int);
+  int size = state.range(0) / static_cast<int>(sizeof(int));
   int thread_size = size / state.threads;
   int from = thread_size * state.thread_index;
   int to = from + thread_size;
@@ -209,8 +209,7 @@
                   std::pair<int, double>(42, 3.8));
 
 void BM_non_template_args(benchmark::State& state, int, double) {
-  while (state.KeepRunning()) {
-  }
+  while(state.KeepRunning()) {}
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 14e03b0..62d1154 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -141,7 +141,7 @@
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int n) { return n * std::log2(n); });
+    ->Complexity([](int n) { return n * log2(n); });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index 4f3d0fb..a79d964 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -39,4 +39,10 @@
 BENCHMARK_TEMPLATE(BM_template1, long);
 BENCHMARK_TEMPLATE1(BM_template1, int);
 
+void BM_counters(benchmark::State& state) {
+    BM_empty(state);
+    state.counters["Foo"] = 2;
+}
+BENCHMARK(BM_counters);
+
 BENCHMARK_MAIN()
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index c6c235d..1046730 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -26,7 +26,7 @@
 }
 
 void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
   try {
     state.PauseTiming();
     std::abort();
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
index b21187a..a705654 100644
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@@ -9,6 +9,22 @@
 std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
 }
 
+// Using DoNotOptimize on types like BitRef seems to cause a lot of problems
+// with the inline assembly on both GCC and Clang.
+struct BitRef {
+  int index;
+  unsigned char &byte;
+
+public:
+  static BitRef Make() {
+    static unsigned char arr[2] = {};
+    BitRef b(1, arr[0]);
+    return b;
+  }
+private:
+  BitRef(int i, unsigned char& b) : index(i), byte(b) {}
+};
+
 int main(int, char*[]) {
   // this test verifies compilation of DoNotOptimize() for some types
 
@@ -29,5 +45,8 @@
 
   benchmark::DoNotOptimize(double_up(x));
 
-  return 0;
+  // These tests verify that DoNotOptimize() accepts rvalues and lvalues of BitRef.
+  benchmark::DoNotOptimize(BitRef::Make());
+  BitRef lval = BitRef::Make();
+  benchmark::DoNotOptimize(lval);
 }
diff --git a/test/options_test.cc b/test/options_test.cc
index bedb1cc..bbbed28 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -1,8 +1,12 @@
 #include "benchmark/benchmark_api.h"
-
 #include <chrono>
 #include <thread>
 
+#if defined(NDEBUG)
+#undef NDEBUG
+#endif
+#include <cassert>
+
 void BM_basic(benchmark::State& state) {
   while (state.KeepRunning()) {
   }
@@ -40,4 +44,22 @@
 
 BENCHMARK(BM_basic)->Apply(CustomArgs);
 
+void BM_explicit_iteration_count(benchmark::State& st) {
+  // Test that benchmarks specified with an explicit iteration count are
+  // only run once.
+  static bool invoked_before = false;
+  assert(!invoked_before);
+  invoked_before = true;
+
+  // Test that the requested iteration count is respected.
+  assert(st.max_iterations == 42);
+  size_t actual_iterations = 0;
+  while (st.KeepRunning())
+    ++actual_iterations;
+  assert(st.iterations() == st.max_iterations);
+  assert(st.iterations() == 42);
+
+}
+BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
+
 BENCHMARK_MAIN()
diff --git a/test/output_test.h b/test/output_test.h
index 57d4397..897a138 100644
--- a/test/output_test.h
+++ b/test/output_test.h
@@ -7,6 +7,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <functional>
+#include <sstream>
 
 #include "../src/re.h"
 #include "benchmark/benchmark.h"
@@ -59,6 +61,134 @@
 void RunOutputTests(int argc, char* argv[]);
 
 // ========================================================================= //
+// ------------------------- Results checking ------------------------------ //
+// ========================================================================= //
+
+// Call this macro to register a benchmark for checking its results. This
+// should be all that's needed. It subscribes a function to check the (CSV)
+// results of a benchmark. This is done only after verifying that the output
+// strings are really as expected.
+// bm_name_pattern: a name or a regex pattern which will be matched against
+//                  all the benchmark names. Matching benchmarks
+//                  will be the subject of a call to checker_function
+// checker_function: should be of type ResultsCheckFn (see below)
+#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \
+    size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
+
+struct Results;
+typedef std::function< void(Results const&) > ResultsCheckFn;
+
+size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+
+// Class holding the results of a benchmark.
+// It is passed in calls to checker functions.
+struct Results {
+
+  // the benchmark name
+  std::string name;
+  // the benchmark fields
+  std::map< std::string, std::string > values;
+
+  Results(const std::string& n) : name(n) {}
+
+  int NumThreads() const;
+
+  typedef enum { kCpuTime, kRealTime } BenchmarkTime;
+
+  // get cpu_time or real_time in seconds
+  double GetTime(BenchmarkTime which) const;
+
+  // get the real_time duration of the benchmark in seconds.
+  // it is better to use fuzzy float checks for this, as the float
+  // ASCII formatting is lossy.
+  double DurationRealTime() const {
+    return GetAs< double >("iterations") * GetTime(kRealTime);
+  }
+  // get the cpu_time duration of the benchmark in seconds
+  double DurationCPUTime() const {
+    return GetAs< double >("iterations") * GetTime(kCpuTime);
+  }
+
+  // get the string for a result by name, or nullptr if the name
+  // is not found
+  const std::string* Get(const char* entry_name) const {
+    auto it = values.find(entry_name);
+    if(it == values.end()) return nullptr;
+    return &it->second;
+  }
+
+  // get a result by name, parsed as a specific type.
+  // NOTE: for counters, use GetCounterAs instead.
+  template <class T>
+  T GetAs(const char* entry_name) const;
+
+  // counters are written as doubles, so they have to be read first
+  // as a double, and only then converted to the asked type.
+  template <class T>
+  T GetCounterAs(const char* entry_name) const {
+    double dval = GetAs< double >(entry_name);
+    T tval = static_cast< T >(dval);
+    return tval;
+  }
+};
+
+template <class T>
+T Results::GetAs(const char* entry_name) const {
+  auto *sv = Get(entry_name);
+  CHECK(sv != nullptr && !sv->empty());
+  std::stringstream ss;
+  ss << *sv;
+  T out;
+  ss >> out;
+  CHECK(!ss.fail());
+  return out;
+}
+
+//----------------------------------
+// Macros to help in result checking. Do not use them with arguments causing
+// side-effects.
+
+#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
+    CONCAT(CHECK_, relationship)                                        \
+    (entry.getfn< var_type >(var_name), (value)) << "\n"                \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"
+
+// check with tolerance. eps_factor is the tolerance window, which is
+// interpreted relative to value (eg, 0.1 means 10% of value).
+#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+    CONCAT(CHECK_FLOAT_, relationship)                                  \
+    (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"                \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "with tolerance of " << (eps_factor) * (value)                   \
+    << " (" << (eps_factor)*100. << "%), "                              \
+    << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \
+    << " (" << (((entry).getfn< var_type >(var_name) - (value))         \
+               /                                                        \
+               ((value) > 1.e-5 || value < -1.e-5 ? value : 1.e-5)*100.) \
+    << "%)"
+
+#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+
+#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+
+#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+
+#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+
+// ========================================================================= //
 // --------------------------- Misc Utilities ------------------------------ //
 // ========================================================================= //
 
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
index 721d39f..24746f6 100644
--- a/test/output_test_helper.cc
+++ b/test/output_test_helper.cc
@@ -2,10 +2,12 @@
 #include <map>
 #include <memory>
 #include <sstream>
+#include <cstring>
 
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
 #include "../src/re.h"     // NOTE: re.h is for internal use only
 #include "output_test.h"
+#include "../src/benchmark_api_internal.h"
 
 // ========================================================================= //
 // ------------------------------ Internals -------------------------------- //
@@ -31,21 +33,29 @@
 
 SubMap& GetSubstitutions() {
   // Don't use 'dec_re' from header because it may not yet be initialized.
-  static std::string dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+  static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
+      // human-readable float
+      {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
       {"%time", "[ ]*[0-9]{1,5} ns"},
       {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"},
       {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
-      {"%csv_report", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,"},
-      {"%csv_us_report", "[0-9]+," + dec_re + "," + dec_re + ",us,,,,,"},
+      {"%csv_header",
+       "name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
+       "items_per_second,label,error_occurred,error_message"},
+      {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
+      {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
       {"%csv_bytes_report",
-       "[0-9]+," + dec_re + "," + dec_re + ",ns," + dec_re + ",,,,"},
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
-       "[0-9]+," + dec_re + "," + dec_re + ",ns,," + dec_re + ",,,"},
-      {"%csv_label_report_begin", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,"},
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"},
+      {"%csv_bytes_items_report",
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re +
+       "," + safe_dec_re + ",,,"},
+      {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"},
       {"%csv_label_report_end", ",,"}};
   return map;
 }
@@ -140,9 +150,180 @@
   std::vector<benchmark::BenchmarkReporter *> reporters_;
 };
 }
+
 }  // end namespace internal
 
 // ========================================================================= //
+// -------------------------- Results checking ----------------------------- //
+// ========================================================================= //
+
+namespace internal {
+
+// Utility class to manage subscribers for checking benchmark results.
+// It works by parsing the CSV output to read the results.
+class ResultsChecker {
+ public:
+
+  struct PatternAndFn : public TestCase { // reusing TestCase for its regexes
+    PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
+    : TestCase(rx), fn(fn_) {}
+    ResultsCheckFn fn;
+  };
+
+  std::vector< PatternAndFn > check_patterns;
+  std::vector< Results > results;
+  std::vector< std::string > field_names;
+
+  void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+
+  void CheckResults(std::stringstream& output);
+
+ private:
+
+  void SetHeader_(const std::string& csv_header);
+  void SetValues_(const std::string& entry_csv_line);
+
+  std::vector< std::string > SplitCsv_(const std::string& line);
+
+};
+
+// store the static ResultsChecker in a function to prevent initialization
+// order problems
+ResultsChecker& GetResultsChecker() {
+  static ResultsChecker rc;
+  return rc;
+}
+
+// add a results checker for a benchmark
+void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+  check_patterns.emplace_back(entry_pattern, fn);
+}
+
+// check the results of all subscribed benchmarks
+void ResultsChecker::CheckResults(std::stringstream& output) {
+  // first reset the stream to the start
+  {
+    auto start = std::ios::streampos(0);
+    // clear before calling tellg()
+    output.clear();
+    // seek to zero only when needed
+    if(output.tellg() > start) output.seekg(start);
+    // and just in case
+    output.clear();
+  }
+  // now go over every line and publish it to the ResultsChecker
+  std::string line;
+  bool on_first = true;
+  while (output.eof() == false) {
+    CHECK(output.good());
+    std::getline(output, line);
+    if (on_first) {
+      SetHeader_(line); // this is important
+      on_first = false;
+      continue;
+    }
+    SetValues_(line);
+  }
+  // finally we can call the subscribed check functions
+  for(const auto& p : check_patterns) {
+    VLOG(2) << "--------------------------------\n";
+    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    for(const auto& r : results) {
+      if(!p.regex->Match(r.name)) {
+        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+        continue;
+      } else {
+        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+      }
+      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      p.fn(r);
+      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+    }
+  }
+}
+
+// prepare for the names in this header
+void ResultsChecker::SetHeader_(const std::string& csv_header) {
+  field_names = SplitCsv_(csv_header);
+}
+
+// set the values for a benchmark
+void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
+  if(entry_csv_line.empty()) return; // some lines are empty
+  CHECK(!field_names.empty());
+  auto vals = SplitCsv_(entry_csv_line);
+  CHECK_EQ(vals.size(), field_names.size());
+  results.emplace_back(vals[0]); // vals[0] is the benchmark name
+  auto &entry = results.back();
+  for (size_t i = 1, e = vals.size(); i < e; ++i) {
+    entry.values[field_names[i]] = vals[i];
+  }
+}
+
+// a quick'n'dirty csv splitter (eliminating quotes)
+std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) {
+  std::vector< std::string > out;
+  if(line.empty()) return out;
+  if(!field_names.empty()) out.reserve(field_names.size());
+  size_t prev = 0, pos = line.find_first_of(','), curr = pos;
+  while(pos != line.npos) {
+    CHECK(curr > 0);
+    if(line[prev] == '"') ++prev;
+    if(line[curr-1] == '"') --curr;
+    out.push_back(line.substr(prev, curr-prev));
+    prev = pos + 1;
+    pos = line.find_first_of(',', pos + 1);
+    curr = pos;
+  }
+  curr = line.size();
+  if(line[prev] == '"') ++prev;
+  if(line[curr-1] == '"') --curr;
+  out.push_back(line.substr(prev, curr-prev));
+  return out;
+}
+
+}  // end namespace internal
+
+size_t AddChecker(const char* bm_name, ResultsCheckFn fn)
+{
+  auto &rc = internal::GetResultsChecker();
+  rc.Add(bm_name, fn);
+  return rc.results.size();
+}
+
+int Results::NumThreads() const {
+  auto pos = name.find("/threads:");
+  if(pos == name.npos) return 1;
+  auto end = name.find('/', pos + 9);
+  std::stringstream ss;
+  ss << name.substr(pos + 9, end);
+  int num = 1;
+  ss >> num;
+  CHECK(!ss.fail());
+  return num;
+}
+
+double Results::GetTime(BenchmarkTime which) const {
+  CHECK(which == kCpuTime || which == kRealTime);
+  const char *which_str = which == kCpuTime ? "cpu_time" : "real_time";
+  double val = GetAs< double >(which_str);
+  auto unit = Get("time_unit");
+  CHECK(unit);
+  if(*unit == "ns") {
+    return val * 1.e-9;
+  } else if(*unit == "us") {
+    return val * 1.e-6;
+  } else if(*unit == "ms") {
+    return val * 1.e-3;
+  } else if(*unit == "s") {
+    return val;
+  } else {
+    CHECK(1 == 0) << "unknown time unit: " << *unit;
+    return 0;
+  }
+}
+
+// ========================================================================= //
 // -------------------------- Public API Definitions------------------------ //
 // ========================================================================= //
 
@@ -186,7 +367,8 @@
 void RunOutputTests(int argc, char* argv[]) {
   using internal::GetTestCaseList;
   benchmark::Initialize(&argc, argv);
-  benchmark::ConsoleReporter CR(benchmark::ConsoleReporter::OO_None);
+  auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/true);
+  benchmark::ConsoleReporter CR(options);
   benchmark::JSONReporter JR;
   benchmark::CSVReporter CSVR;
   struct ReporterTest {
@@ -231,4 +413,11 @@
 
     std::cout << "\n";
   }
+
+  // now that we know the output is as expected, we can dispatch
+  // the checks to the subscribers.
+  auto &csv = TestCases[2];
+  // would use == but gcc spits a warning
+  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
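
Tying the pieces together, a condensed sketch of how a test subscribes to these result checks; it mirrors the user_counters tests added later in this patch, and the benchmark name and expected values are illustrative:

#include "output_test.h"

// A plain function rather than a lambda: VS2013 cannot pass a lambda to
// CHECK_BENCHMARK_RESULTS (see the note in user_counters_tabular_test.cc).
void CheckFooCounter(Results const& e) {
  CHECK_RESULT_VALUE(e, int, "iterations", GT, 0);
  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1);
}
CHECK_BENCHMARK_RESULTS("BM_WithCounters", &CheckFooCounter);

int main(int argc, char* argv[]) { RunOutputTests(argc, argv); return 0; }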
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index e9f8ea5..2769b7a 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -114,23 +114,23 @@
 #endif
 #ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
   {
-    int x = 42;
+    const char* x = "42";
     auto capturing_lam = [=](benchmark::State& st) {
       while (st.KeepRunning()) {
       }
-      st.SetLabel(std::to_string(x));
+      st.SetLabel(x);
     };
     benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam);
-    AddCases({{"lambda_benchmark", "42"}});
+    AddCases({{"lambda_benchmark", x}});
   }
 #endif
 }
 
-int main(int argc, char* argv[]) {
+// Test that all benchmarks, registered either during static init or at runtime,
+// are run and the results are passed to the reporter.
+void RunTestOne() {
   TestRegistrationAtRuntime();
 
-  benchmark::Initialize(&argc, argv);
-
   TestReporter test_reporter;
   benchmark::RunSpecifiedBenchmarks(&test_reporter);
 
@@ -143,6 +143,40 @@
     ++EB;
   }
   assert(EB == ExpectedResults.end());
+}
 
-  return 0;
+// Test that ClearRegisteredBenchmarks() clears all previously registered
+// benchmarks.
+// Also test that new benchmarks can be registered and run afterwards.
+void RunTestTwo() {
+  assert(ExpectedResults.size() != 0 &&
+         "must have at least one registered benchmark");
+  ExpectedResults.clear();
+  benchmark::ClearRegisteredBenchmarks();
+
+  TestReporter test_reporter;
+  size_t num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  assert(num_ran == 0);
+  assert(test_reporter.all_runs_.begin() == test_reporter.all_runs_.end());
+
+  TestRegistrationAtRuntime();
+  num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  assert(num_ran == ExpectedResults.size());
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+}
+
+int main(int argc, char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  RunTestOne();
+  RunTestTwo();
 }
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index 2e6d2b2..4a48143 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -9,11 +9,11 @@
 // ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
 
-ADD_CASES(TC_ConsoleOut, {{"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
-                          {"^[-]+$", MR_Next}});
-ADD_CASES(TC_CSVOut,
-          {{"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
-            "items_per_second,label,error_occurred,error_message"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^[-]+$", MR_Next},
+           {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
+           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"%csv_header"}});
 
 // ========================================================================= //
 // ------------------------ Testing Basic Output --------------------------- //
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
new file mode 100644
index 0000000..5fc5b4d
--- /dev/null
+++ b/test/user_counters_tabular_test.cc
@@ -0,0 +1,250 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// @todo: <jpmag> this checks the full output at once; the rule for
+// CounterSet1 was failing because it was not matching "^[-]+$".
+// @todo: <jpmag> check that the counters are vertically aligned.
+ADD_CASES(TC_ConsoleOut, {
+// keeping these lines long improves readability, so:
+// clang-format off
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
+// clang-format on
+});
+ADD_CASES(TC_CSVOut, {{"%csv_header,"
+                       "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+void BM_Counters_Tabular(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+    {"Foo",  { 1, bm::Counter::kAvgThreads}},
+    {"Bar",  { 2, bm::Counter::kAvgThreads}},
+    {"Baz",  { 4, bm::Counter::kAvgThreads}},
+    {"Bat",  { 8, bm::Counter::kAvgThreads}},
+    {"Frob", {16, bm::Counter::kAvgThreads}},
+    {"Lob",  {32, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"Bar\": %float,$", MR_Next},
+                       {"\"Bat\": %float,$", MR_Next},
+                       {"\"Baz\": %float,$", MR_Next},
+                       {"\"Foo\": %float,$", MR_Next},
+                       {"\"Frob\": %float,$", MR_Next},
+                       {"\"Lob\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
+                       "%float,%float,%float,%float,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckTabular(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8);
+  CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
+  CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
+
+// ========================================================================= //
+// -------------------- Tabular+Rate Counters Output ----------------------- //
+// ========================================================================= //
+
+void BM_CounterRates_Tabular(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+    {"Foo",  { 1, bm::Counter::kAvgThreadsRate}},
+    {"Bar",  { 2, bm::Counter::kAvgThreadsRate}},
+    {"Baz",  { 4, bm::Counter::kAvgThreadsRate}},
+    {"Bat",  { 8, bm::Counter::kAvgThreadsRate}},
+    {"Frob", {16, bm::Counter::kAvgThreadsRate}},
+    {"Lob",  {32, bm::Counter::kAvgThreadsRate}},
+  });
+}
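+// Note: kAvgThreadsRate combines kAvgThreads with kIsRate, i.e. the
+// thread-averaged value is additionally divided by the CPU time; this is what
+// CheckTabularRate below verifies (counter value / DurationCPUTime()).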
+BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"Bar\": %float,$", MR_Next},
+                       {"\"Bat\": %float,$", MR_Next},
+                       {"\"Baz\": %float,$", MR_Next},
+                       {"\"Foo\": %float,$", MR_Next},
+                       {"\"Frob\": %float,$", MR_Next},
+                       {"\"Lob\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
+                       "%float,%float,%float,%float,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckTabularRate(Results const& e) {
+  double t = e.DurationCPUTime();
+  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32./t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int",
+                        &CheckTabularRate);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters
+void BM_CounterSet0_Tabular(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+    {"Foo", {10, bm::Counter::kAvgThreads}},
+    {"Bar", {20, bm::Counter::kAvgThreads}},
+    {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"Bar\": %float,$", MR_Next},
+                       {"\"Baz\": %float,$", MR_Next},
+                       {"\"Foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet0(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0);
+
+// set only some of the counters again (same subset, different values).
+void BM_CounterSet1_Tabular(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+    {"Foo", {15, bm::Counter::kAvgThreads}},
+    {"Bar", {25, bm::Counter::kAvgThreads}},
+    {"Baz", {45, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"Bar\": %float,$", MR_Next},
+                       {"\"Baz\": %float,$", MR_Next},
+                       {"\"Foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet1(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters, using a different subset this time.
+void BM_CounterSet2_Tabular(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+    {"Foo", {10, bm::Counter::kAvgThreads}},
+    {"Bat", {30, bm::Counter::kAvgThreads}},
+    {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"Bat\": %float,$", MR_Next},
+                       {"\"Baz\": %float,$", MR_Next},
+                       {"\"Foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
+                       ",%float,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet2(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
new file mode 100644
index 0000000..66df48b
--- /dev/null
+++ b/test/user_counters_test.cc
@@ -0,0 +1,217 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^[-]+$", MR_Next},
+           {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next},
+           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});
+
+// ========================================================================= //
+// ------------------------- Simple Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Simple(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2 * (double)state.iterations();
+}
+BENCHMARK(BM_Counters_Simple);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSimple(Results const& e) {
+  double its = e.GetAs< double >("iterations");
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  // check that the value of bar is within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
+
+// ========================================================================= //
+// --------------------- Counters+Items+Bytes/s Output --------------------- //
+// ========================================================================= //
+
+namespace { int num_calls1 = 0; }
+void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = ++num_calls1;
+  state.SetBytesProcessed(364);
+  state.SetItemsProcessed(150);
+}
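+// Note: the per-second rates reported via SetBytesProcessed/SetItemsProcessed
+// are based on CPU time (not real time), so CheckBytesAndItemsPSec below
+// expects 364/t bytes/s and 150/t items/s with t = DurationCPUTime().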
+BENCHMARK(BM_Counters_WithBytesAndItemsPSec);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
+            "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bytes_per_second\": %int,$", MR_Next},
+                       {"\"items_per_second\": %int,$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\","
+                       "%csv_bytes_items_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckBytesAndItemsPSec(Results const& e) {
+  double t = e.DurationCPUTime(); // this (and not real time) is the time used
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1);
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
+                        &CheckBytesAndItemsPSec);
+
+// ========================================================================= //
+// ------------------------- Rate Counters Output -------------------------- //
+// ========================================================================= //
+
+void BM_Counters_Rate(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate};
+}
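+// Note: kIsRate counters are reported as the counter value divided by the CPU
+// time of the run, hence CheckRate below expects 1/t and 2/t.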
+BENCHMARK(BM_Counters_Rate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckRate(Results const& e) {
+  double t = e.DurationCPUTime(); // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
+
+// ========================================================================= //
+// ------------------------- Thread Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Threads(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2;
+}
+BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads());
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads());
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads);
+
+// ========================================================================= //
+// ---------------------- ThreadAvg Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_AvgThreads(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads};
+}
+BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckAvgThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
+                        &CheckAvgThreads);
+
+// ========================================================================= //
+// -------------------- ThreadAvg Rate Counters Output --------------------- //
+// ========================================================================= //
+
+void BM_Counters_AvgThreadsRate(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate};
+}
+BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckAvgThreadsRate(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
+                        &CheckAvgThreadsRate);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
index ed0f133..d54baaa 100755
--- a/tools/compare_bench.py
+++ b/tools/compare_bench.py
@@ -3,25 +3,63 @@
 compare_bench.py - Compare two benchmarks or their results and report the
                    difference.
 """
+import argparse
+from argparse import ArgumentParser
 import sys
 import gbench
 from gbench import util, report
+from gbench.util import *
+
+def check_inputs(in1, in2, flags):
+    """
+    Perform checks on the user-provided inputs and diagnose any abnormalities.
+    """
+    in1_kind, in1_err = classify_input_file(in1)
+    in2_kind, in2_err = classify_input_file(in2)
+    output_file = find_benchmark_flag('--benchmark_out=', flags)
+    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
+    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+        print(("WARNING: '--benchmark_out=%s' will be passed to both "
+              "benchmarks causing it to be overwritten") % output_file)
+    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
+        print("WARNING: passing --benchmark flags has no effect since both "
+              "inputs are JSON")
+    if output_type is not None and output_type != 'json':
+        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`"
+              " is not supported.") % output_type)
+        sys.exit(1)
+
 
 def main():
+    parser = ArgumentParser(
+        description='compare the results of two benchmarks')
+    parser.add_argument(
+        'test1', metavar='test1', type=str, nargs=1,
+        help='A benchmark executable or JSON output file')
+    parser.add_argument(
+        'test2', metavar='test2', type=str, nargs=1,
+        help='A benchmark executable or JSON output file')
+    # FIXME this is a dummy argument which will never actually match
+    # any --benchmark flags but it helps generate a better usage message
+    parser.add_argument(
+        'benchmark_options', metavar='benchmark_option', nargs='*',
+        help='Arguments to pass when running benchmark executables'
+    )
+    args, unknown_args = parser.parse_known_args()
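+    # parse_known_args() leaves flags it does not recognize (e.g. the benchmark
+    # binary's own --benchmark_* options) in unknown_args so they can be
+    # forwarded to the executables below.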
     # Parse the command line flags
-    def usage():
-        print('compare_bench.py <test1> <test2> [benchmark options]...')
+    test1 = args.test1[0]
+    test2 = args.test2[0]
+    if args.benchmark_options:
+        print("Unrecognized positional argument arguments: '%s'"
+              % args.benchmark_options)
         exit(1)
-    if '--help' in sys.argv or len(sys.argv) < 3:
-        usage()
-    tests = sys.argv[1:3]
-    bench_opts = sys.argv[3:]
-    bench_opts = list(bench_opts)
+    benchmark_options = unknown_args
+    check_inputs(test1, test2, benchmark_options)
     # Run the benchmarks and report the results
-    json1 = gbench.util.run_or_load_benchmark(tests[0], bench_opts)
-    json2 = gbench.util.run_or_load_benchmark(tests[1], bench_opts)
+    json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
+    json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options)
     output_lines = gbench.report.generate_difference_report(json1, json2)
-    print 'Comparing %s to %s' % (tests[0], tests[1])
+    print('Comparing %s to %s' % (test1, test2))
     for ln in output_lines:
         print(ln)
 
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
index da9425e..37faed4 100644
--- a/tools/gbench/Inputs/test1_run1.json
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -41,6 +41,20 @@
       "real_time": 100,
       "cpu_time": 100,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 10000,
+      "cpu_time": 10000,
+      "time_unit": "ns"
     }
   ]
 }
\ No newline at end of file
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
index d8bc72d..aed5151 100644
--- a/tools/gbench/Inputs/test1_run2.json
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -41,6 +41,20 @@
       "real_time": 110,
       "cpu_time": 110,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 10000,
+      "cpu_time": 10000,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
     }
   ]
 }
\ No newline at end of file
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index ac69b9b..015d33d 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -80,7 +80,9 @@
     first_line = "{:<{}s}     Time           CPU           Old           New".format(
         'Benchmark', first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
-    for bn in json1['benchmarks']:
+
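+    # Only entries that carry timing data can be compared; skip anything
+    # without 'real_time'/'cpu_time' fields.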
+    gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn)
+    for bn in gen:
         other_bench = find_test(bn['name'])
         if not other_bench:
             continue
@@ -92,7 +94,7 @@
                 return BC_WHITE
             else:
                 return BC_CYAN
-        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}         {:4d}         {:4d}"
+        fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14d}{:14d}"
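+        # fmt_str columns: colored benchmark name, signed time change, signed
+        # CPU change, then the old and new time values (widths line up with the
+        # header row above).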
         tres = calculate_change(bn['real_time'], other_bench['real_time'])
         cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
         output_strs += [color_format(use_color, fmt_str,
@@ -121,19 +123,22 @@
 
     def test_basic(self):
         expect_lines = [
-            ['BM_SameTimes', '+0.00', '+0.00'],
-            ['BM_2xFaster', '-0.50', '-0.50'],
-            ['BM_2xSlower', '+1.00', '+1.00'],
-            ['BM_10PercentFaster', '-0.10', '-0.10'],
-            ['BM_10PercentSlower', '+0.10', '+0.10']
+            ['BM_SameTimes', '+0.00', '+0.00', '10', '10'],
+            ['BM_2xFaster', '-0.50', '-0.50', '50', '25'],
+            ['BM_2xSlower', '+1.00', '+1.00', '50', '100'],
+            ['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'],
+            ['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'],
+            ['BM_100xSlower', '+99.00', '+99.00', '100', '10000'],
+            ['BM_100xFaster', '-0.99', '-0.99', '10000', '100'],
         ]
         json1, json2 = self.load_results()
-        output_lines = generate_difference_report(json1, json2, use_color=False)
-        print output_lines
+        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in xrange(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(len(parts), 3)
+            self.assertEqual(len(parts), 5)
             self.assertEqual(parts, expect_lines[i])
 
 
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
index 169b71c..07c2377 100644
--- a/tools/gbench/util.py
+++ b/tools/gbench/util.py
@@ -20,21 +20,21 @@
     """
     if not os.path.isfile(filename):
         return False
-    with open(filename, 'r') as f:
+    with open(filename, mode='rb') as f:
         magic_bytes = f.read(_num_magic_bytes)
     if sys.platform == 'darwin':
         return magic_bytes in [
-            '\xfe\xed\xfa\xce',  # MH_MAGIC
-            '\xce\xfa\xed\xfe',  # MH_CIGAM
-            '\xfe\xed\xfa\xcf',  # MH_MAGIC_64
-            '\xcf\xfa\xed\xfe',  # MH_CIGAM_64
-            '\xca\xfe\xba\xbe',  # FAT_MAGIC
-            '\xbe\xba\xfe\xca'   # FAT_CIGAM
+            b'\xfe\xed\xfa\xce',  # MH_MAGIC
+            b'\xce\xfa\xed\xfe',  # MH_CIGAM
+            b'\xfe\xed\xfa\xcf',  # MH_MAGIC_64
+            b'\xcf\xfa\xed\xfe',  # MH_CIGAM_64
+            b'\xca\xfe\xba\xbe',  # FAT_MAGIC
+            b'\xbe\xba\xfe\xca'   # FAT_CIGAM
         ]
     elif sys.platform.startswith('win'):
-        return magic_bytes == 'MZ'
+        return magic_bytes == b'MZ'
     else:
-        return magic_bytes == '\x7FELF'
+        return magic_bytes == b'\x7FELF'
 
 
 def is_json_file(filename):
@@ -68,7 +68,7 @@
     elif is_json_file(filename):
         ftype = IT_JSON
     else:
-        err_msg = "'%s' does not name a valid benchmark executable or JSON file"
+        err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename
     return ftype, err_msg
 
 
@@ -80,10 +80,30 @@
     """
     ftype, msg = classify_input_file(filename)
     if ftype == IT_Invalid:
-        print "Invalid input file: %s" % msg
+        print("Invalid input file: %s" % msg)
         sys.exit(1)
     return ftype
 
+def find_benchmark_flag(prefix, benchmark_flags):
+    """
+    Search the specified list of flags for a flag matching `<prefix><arg>` and,
+    if one is found, return the arg it specifies. If the flag occurs more than
+    once, the last value is returned. If the flag is not found, None is returned.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    result = None
+    for f in benchmark_flags:
+        if f.startswith(prefix):
+            result = f[len(prefix):]
+    return result
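+    # Illustrative example: with prefix '--benchmark_out=' and
+    # ['--benchmark_out=a.json', '--benchmark_out=b.json'] this returns
+    # 'b.json', i.e. the last occurrence wins.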
+
+def remove_benchmark_flags(prefix, benchmark_flags):
+    """
+    Return a new list containing the specified benchmark_flags except those
+    with the specified prefix.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    return [f for f in benchmark_flags if not f.startswith(prefix)]
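+    # Illustrative example: remove_benchmark_flags('--benchmark_out=',
+    # ['--benchmark_out=a.json', '--v=2']) returns ['--v=2'].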
 
 def load_benchmark_results(fname):
     """
@@ -101,16 +121,25 @@
     real time console output.
     RETURNS: A JSON object representing the benchmark output
     """
-    thandle, tname = tempfile.mkstemp()
-    os.close(thandle)
+    output_name = find_benchmark_flag('--benchmark_out=',
+                                      benchmark_flags)
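+    # Honor a user-supplied --benchmark_out file if one was given; otherwise
+    # write to a temporary file and delete it once the results are loaded.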
+    is_temp_output = False
+    if output_name is None:
+        is_temp_output = True
+        thandle, output_name = tempfile.mkstemp()
+        os.close(thandle)
+        benchmark_flags = list(benchmark_flags) + \
+                          ['--benchmark_out=%s' % output_name]
+
     cmd = [exe_name] + benchmark_flags
     print("RUNNING: %s" % ' '.join(cmd))
-    exitCode = subprocess.call(cmd + ['--benchmark_out=%s' % tname])
+    exitCode = subprocess.call(cmd)
     if exitCode != 0:
         print('TEST FAILED...')
         sys.exit(exitCode)
-    json_res = load_benchmark_results(tname)
-    os.unlink(tname)
+    json_res = load_benchmark_results(output_name)
+    if is_temp_output:
+        os.unlink(output_name)
     return json_res