// src/benchmark.cc - platform/external/google-benchmark - Git at Google

 // Copyright 2015 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "benchmark/benchmark.h"

 #include <sys/time.h>
 #include <sys/resource.h>
 #include <unistd.h>

 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
 #include <atomic>
 #include <condition_variable>
 #include <iostream>
 #include <memory>
 #include <thread>

 #include "check.h"
 #include "commandlineflags.h"
 #include "colorprint.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
 #include "stat.h"
 #include "string_util.h"
 #include "sysinfo.h"
 #include "walltime.h"

 // Command-line flags of the benchmark library. Their values are filled in by
 // internal::ParseCommandLineFlags() further down in this file.

 // NOTE(review): RunSpecifiedBenchmarks() in this file rewrites an empty
 // filter (and the string "all") to ".", so an empty value currently selects
 // every benchmark, which differs from what the help text below states.
 DEFINE_string(benchmark_filter, ".",
               "A regular expression that specifies the set of benchmarks "
               "to execute.  If this flag is empty, no benchmarks are run.  "
               "If this flag is the string \"all\", all benchmarks linked "
               "into the process are run.");

 // When non-zero, RunBenchmark() uses this value as the fixed iteration count.
 DEFINE_int32(benchmark_iterations, 0,
              "Total number of iterations per benchmark. 0 means the benchmarks "
              "are time-based.");

 // Time threshold RunBenchmark() uses to decide whether a run was long enough.
 DEFINE_double(benchmark_min_time, 0.5,
               "Minimum number of seconds we should run benchmark before "
               "results are considered significant.  For cpu-time based "
               "tests, this is the lower bound on the total cpu time "
               "used by all threads that make up the test.  For real-time "
               "based tests, this is the lower bound on the elapsed time "
               "of the benchmark execution, regardless of number of "
               "threads.");

 // Number of reported runs per benchmark instance.
 DEFINE_int32(benchmark_repetitions, 1,
              "The number of runs of each benchmark. If greater than 1, the "
              "mean and standard deviation of the runs will be reported.");

 DEFINE_bool(color_print, true, "Enables colorized logging.");

 // Passed to internal::SetLogLevel() by Initialize().
 DEFINE_int32(v, 0, "The level of verbose logging to output");


 // Length of a string literal, not counting the terminating NUL.
 // The ""'s catch people who don't pass in a literal for "str"
 #define strliterallen(str) (sizeof("" str "") - 1)

 // If the first `len` bytes of `str` begin with `prefix`, evaluates to a
 // pointer just past that prefix inside `str`; otherwise evaluates to nullptr.
 // Must use a string literal for prefix.
 // `str` is parenthesized at every use so that expression arguments (for
 // example a conditional expression) expand with the intended precedence.
 // Note that `str` is still evaluated more than once.
 #define memprefix(str, len, prefix)                         \
   ((((len) >= strliterallen(prefix)) &&                     \
     std::memcmp((str), prefix, strliterallen(prefix)) == 0) \
        ? (str) + strliterallen(prefix)                      \
        : nullptr)


 namespace benchmark {

 namespace internal {

 // NOTE: This is a dummy "mutex" type used to denote the actual mutex
 // returned by GetBenchmarkLock(). This is only used to placate the thread
 // safety warnings by giving the return of GetBenchmarkLock() a name.
 // The variable below is only referenced from the RETURN_CAPABILITY
 // annotation on GetBenchmarkLock(); it is an empty struct and carries no
 // state of its own.
 struct CAPABILITY("mutex") BenchmarkLockType {};
 BenchmarkLockType BenchmarkLockVar;

 } // end namespace internal

 // Returns the single mutex that guards the shared benchmark state in this
 // file (the report label, use_real_time and the per-run ThreadStats totals).
 // The mutex is a function-local static, so every call returns the same
 // object.
 inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
 GetBenchmarkLock()
 {
   static Mutex lock;
   return lock;
 }

 namespace {

 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
 static const int kRangeMultiplier = 8;
 // Upper bound on the per-thread iteration count RunBenchmark() will grow to.
 static const int kMaxIterations = 1000000000;

 // Set to true by RunBenchmark() while benchmark functions are executing;
 // State::PauseTiming/ResumeTiming/SetLabel CHECK it.
 bool running_benchmark = false;

 // Global variable so that a benchmark can cause a little extra printing
 // Returns a pointer to the shared label string. The string is annotated as
 // guarded by GetBenchmarkLock(), so callers are expected to hold that lock
 // while reading or writing through the returned pointer.
 std::string* GetReportLabel() {
     static std::string label GUARDED_BY(GetBenchmarkLock());
     return &label;
 }

 // Should this benchmark base decisions off of real time rather than
 // cpu time?
 // Reset to false by RunBenchmark() before each run and set to true by
 // State::UseRealTime(); both accesses happen under GetBenchmarkLock().
 bool use_real_time GUARDED_BY(GetBenchmarkLock());

 // TODO(ericwf): support MallocCounter.
 //static benchmark::MallocCounter *benchmark_mc;

 // Reports whether any CPU appears to use a frequency-scaling governor other
 // than "performance".
 //
 // On Linux, the CPUfreq subsystem exposes the governor of each CPU as a file.
 // The files are probed in CPU order; as soon as one cannot be opened we
 // assume the information is unavailable (e.g. not Linux) and report "not
 // enabled" without raising an error.
 static bool CpuScalingEnabled() {
   const int cpu_count = NumCPUs();
   for (int idx = 0; idx < cpu_count; ++idx) {
     const std::string path = StrCat("/sys/devices/system/cpu/cpu", idx,
                                     "/cpufreq/scaling_governor");
     FILE* fp = fopen(path.c_str(), "r");
     if (fp == nullptr) {
       // Unreadable governor file: stop probing and treat scaling as off.
       return false;
     }

     char governor[16];
     const size_t governor_len = fread(governor, 1, sizeof(governor), fp);
     fclose(fp);

     // Anything that does not start with "performance" counts as scaling.
     const bool is_performance =
         memprefix(governor, governor_len, "performance") != nullptr;
     if (!is_performance) {
       return true;
     }
   }
   return false;
 }

 // Aggregates the repetitions of one benchmark into a "<name>_mean" run and a
 // "<name>_stddev" run.
 //
 //   reports:     at least two runs of the same benchmark, all with the same
 //                iteration count (both properties are CHECKed).
 //   mean_data:   receives the mean values; times are scaled back up by the
 //                iteration count so they are comparable to a regular run.
 //   stddev_data: receives the standard deviations; its iteration count is
 //                set to 0.
 void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
                   BenchmarkReporter::Run* mean_data,
                   BenchmarkReporter::Run* stddev_data) {
   CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";

   // Every repetition runs the same number of iterations, so the first
   // report is representative.
   std::size_t const run_iterations = reports.front().iterations;

   // Feed each run into the accumulators (times are per-iteration values).
   Stat1_d real_time_stat;
   Stat1_d cpu_time_stat;
   Stat1_d bytes_rate_stat;
   Stat1_d items_rate_stat;
   for (BenchmarkReporter::Run const& run : reports) {
     CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
     CHECK_EQ(run_iterations, run.iterations);
     real_time_stat +=
         Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
     cpu_time_stat +=
         Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
     items_rate_stat += Stat1_d(run.items_per_second, run.iterations);
     bytes_rate_stat += Stat1_d(run.bytes_per_second, run.iterations);
   }

   // The aggregate rows carry a label only when all runs agree on it.
   const std::string& first_label = reports[0].report_label;
   const bool labels_agree = std::all_of(
       reports.begin() + 1, reports.end(),
       [&first_label](const BenchmarkReporter::Run& other) {
         return other.report_label == first_label;
       });
   const std::string shared_label = labels_agree ? first_label : std::string();

   // Mean row.
   mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
   mean_data->report_label = shared_label;
   mean_data->iterations = run_iterations;
   mean_data->real_accumulated_time = real_time_stat.Mean() * run_iterations;
   mean_data->cpu_accumulated_time = cpu_time_stat.Mean() * run_iterations;
   mean_data->bytes_per_second = bytes_rate_stat.Mean();
   mean_data->items_per_second = items_rate_stat.Mean();

   // Standard-deviation row.
   stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
   stddev_data->report_label = shared_label;
   stddev_data->iterations = 0;
   stddev_data->real_accumulated_time = real_time_stat.StdDev();
   stddev_data->cpu_accumulated_time = cpu_time_stat.StdDev();
   stddev_data->bytes_per_second = bytes_rate_stat.StdDev();
   stddev_data->items_per_second = items_rate_stat.StdDev();
 }

 // Totals reported by the benchmark threads of one run. RunInThread() adds
 // each thread's State counters into an instance of this struct while
 // holding GetBenchmarkLock().
 struct ThreadStats {
     // Both counters start at zero.
     ThreadStats() : bytes_processed(0), items_processed(0) {}
     int64_t bytes_processed;
     int64_t items_processed;
 };

 // Timer management class.
 //
 // Accumulates the wall-clock and CPU time that elapses while the timer is
 // running for one benchmark run executed by num_threads threads.
 // StartTimer() and StopTimer() double as barriers: every thread must call
 // them, and the timer state only changes once the last thread has arrived.
 class TimerManager {
  public:
   // num_threads: number of threads that participate in each barrier and
   //              that must call Finalize().
   // done:        notified once all threads have called Finalize().
   TimerManager(int num_threads, Notification* done)
       : num_threads_(num_threads),
         done_(done),
         running_(false),
         // The start times are only meaningful while running_ is true, but
         // they are initialized so no member is ever left indeterminate.
         start_real_time_(0),
         start_cpu_time_(0),
         real_time_used_(0),
         cpu_time_used_(0),
         num_finalized_(0),
         phase_number_(0),
         entered_(0) {
   }

   // Called by each thread
   // Blocks until all threads have arrived; the last one starts the timer and
   // then wakes the waiting threads.
   void StartTimer() EXCLUDES(lock_) {
     bool last_thread = false;
     {
       MutexLock ml(lock_);
       last_thread = Barrier(ml);
       if (last_thread) {
         CHECK(!running_) << "Called StartTimer when timer is already running";
         running_ = true;
         start_real_time_ = walltime::Now();
         start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
       }
     }
     // Notify after the lock has been released.
     if (last_thread) {
       phase_condition_.notify_all();
     }
   }

   // Called by each thread
   // Blocks until all threads have arrived; the last one stops the timer and
   // then wakes the waiting threads.
   void StopTimer() EXCLUDES(lock_) {
     bool last_thread = false;
     {
       MutexLock ml(lock_);
       last_thread = Barrier(ml);
       if (last_thread) {
         CHECK(running_) << "Called StopTimer when timer is already stopped";
         InternalStop();
       }
     }
     // Notify after the lock has been released.
     if (last_thread) {
       phase_condition_.notify_all();
     }
   }

   // Called by each thread
   // Once every thread has finalized, signals the `done` notification passed
   // to the constructor.
   void Finalize() EXCLUDES(lock_) {
     MutexLock l(lock_);
     num_finalized_++;
     if (num_finalized_ == num_threads_) {
       CHECK(!running_) <<
         "The timer should be stopped before the timer is finalized";
       done_->Notify();
     }
   }

   // REQUIRES: timer is not running
   // Returns the accumulated wall-clock time.
   double real_time_used() EXCLUDES(lock_) {
     MutexLock l(lock_);
     CHECK(!running_);
     return real_time_used_;
   }

   // REQUIRES: timer is not running
   // Returns the accumulated CPU time (this process plus its children).
   double cpu_time_used() EXCLUDES(lock_) {
     MutexLock l(lock_);
     CHECK(!running_);
     return cpu_time_used_;
   }

  private:
   Mutex lock_;
   Condition phase_condition_;
   int num_threads_;
   Notification* done_;

   bool running_;                // Is the timer running
   double start_real_time_;      // If running_
   double start_cpu_time_;       // If running_

   // Accumulated time so far (does not contain current slice if running_)
   double real_time_used_;
   double cpu_time_used_;

   // How many threads have called Finalize()
   int num_finalized_;

   // State for barrier management
   int phase_number_;
   int entered_;         // Number of threads that have entered this barrier

   // Stops the timer and adds the elapsed slice to the accumulated totals.
   void InternalStop() REQUIRES(lock_) {
     CHECK(running_);
     running_ = false;
     real_time_used_ += walltime::Now() - start_real_time_;
     cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
                        - start_cpu_time_);
   }

   // Enter the barrier and wait until all other threads have also
   // entered the barrier.  Returns true iff this is the last thread to
   // enter the barrier; every other thread returns false after being woken.
   bool Barrier(MutexLock& ml) REQUIRES(lock_) {
     CHECK_LT(entered_, num_threads_);
     entered_++;
     if (entered_ < num_threads_) {
       // Wait for all threads to enter
       // The phase number is captured by value so the predicate stays false
       // until the last thread has advanced to the next phase.
       int phase_number_cp = phase_number_;
       auto cb = [this, phase_number_cp]() {
         return this->phase_number_ > phase_number_cp;
       };
       phase_condition_.wait(ml.native_handle(), cb);
       return false;  // I was not the last one
     } else {
       // Last thread has reached the barrier
       phase_number_++;
       entered_ = 0;
       return true;
     }
   }
 };

 // TimerManager for current run.
 // RunBenchmark() installs a fresh instance before each run and resets it
 // afterwards; State::PauseTiming/ResumeTiming and RunInThread() use it.
 static std::unique_ptr<TimerManager> timer_manager = nullptr;

 } // end namespace

 namespace internal {

 // Information kept per benchmark we may want to run
 // One Instance is produced by BenchmarkFamilies::FindBenchmarks() for every
 // (argument pair, thread count) combination of a matching family.
 struct Benchmark::Instance {
   std::string   name;       // Family name plus argument / thread suffixes
   Function*     function;   // Benchmark function to invoke
   bool          has_arg1;   // Whether arg1 is meaningful
   int           arg1;
   bool          has_arg2;   // Whether arg2 is meaningful
   int           arg2;
   int           threads;    // Number of concurrent threads to use
   bool          multithreaded;  // Is benchmark multi-threaded?
 };


 // Class for managing registered benchmarks.  Note that each registered
 // benchmark identifies a family of related benchmarks to run.
 // Accessed through the single object returned by GetInstance(); the
 // constructor and destructor are private.
 class BenchmarkFamilies {
  public:
   // Returns the single registry object.
   static BenchmarkFamilies* GetInstance();

   // Registers a benchmark family and returns the index assigned to it.
   size_t AddBenchmark(Benchmark* family);

   // Unregisters a family at the given index.
   void RemoveBenchmark(size_t index);

   // Extract the list of benchmark instances that match the specified
   // regular expression.
   // Returns false if the regular expression cannot be compiled.
   bool FindBenchmarks(const std::string& re,
                       std::vector<Benchmark::Instance>* benchmarks);
  private:
   BenchmarkFamilies();
   ~BenchmarkFamilies();

   // Registered families; a removed family leaves a nullptr slot behind.
   std::vector<Benchmark*> families_;
   // Held while families_ is read or modified by the public methods.
   Mutex mutex_;
 };


 // Returns the registry object. It is a function-local static, so the same
 // pointer is returned on every call.
 BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
   static BenchmarkFamilies instance;
   return &instance;
 }

 // Creates an empty registry; members are default-constructed.
 BenchmarkFamilies::BenchmarkFamilies() { }

 // Deletes every family that is still registered.
 // Each ~Benchmark() calls RemoveBenchmark(), which writes nullptr into the
 // slot being visited. That is safe here because RemoveBenchmark() never
 // resizes families_, and the loop works on a copy of the pointer.
 BenchmarkFamilies::~BenchmarkFamilies() {
   for (internal::Benchmark* family : families_) {
     delete family;
   }
 }

 // Stores `family` in the registry and returns the index of the slot used.
 // A slot vacated by an earlier RemoveBenchmark() is preferred (lowest index
 // first) so the vector does not grow unnecessarily; otherwise the family is
 // appended.
 size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) {
   MutexLock l(mutex_);
   auto free_slot = std::find(families_.begin(), families_.end(),
                              static_cast<Benchmark*>(nullptr));
   if (free_slot == families_.end()) {
     // No vacant slot: append.
     families_.push_back(family);
     return families_.size() - 1;
   }
   *free_slot = family;
   return static_cast<size_t>(free_slot - families_.begin());
 }

 // Marks the slot at `index` as vacant. The family object itself is not
 // deleted here. `index` is used unchecked, so it must be a valid slot.
 void BenchmarkFamilies::RemoveBenchmark(size_t index) {
   MutexLock l(mutex_);
   families_[index] = nullptr;
   // Don't shrink families_ here, we might be called by the destructor of
   // BenchmarkFamilies which iterates over the vector.
 }

 // Appends to *benchmarks one Instance for every (argument pair, thread
 // count) combination of each registered family whose name matches `spec`.
 // Returns false (after printing a message to stderr) when `spec` is not a
 // valid regular expression, true otherwise.
 bool BenchmarkFamilies::FindBenchmarks(
     const std::string& spec,
     std::vector<Benchmark::Instance>* benchmarks) {
   // Compile the filter; bail out early when it is malformed.
   std::string error_msg;
   Regex re;
   const bool compiled = re.Init(spec, &error_msg);
   if (!compiled) {
     std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
     return false;
   }

   // Thread-count list used for families that never requested threads.
   std::vector<int> one_thread(1, 1);

   MutexLock l(mutex_);
   for (Benchmark* family : families_) {
     // Skip vacated slots and families rejected by the filter.
     if (family == nullptr) continue;
     if (!re.Match(family->name_)) continue;

     // A family without explicit arguments still produces one instance, so
     // give it a single placeholder argument pair.
     if (family->arg_count_ == -1) {
       family->arg_count_ = 0;
       family->args_.emplace_back(-1, -1);
     }

     // These do not change while the family's instances are generated.
     const bool explicit_threads = !family->thread_counts_.empty();
     const std::vector<int>& thread_counts =
         explicit_threads ? family->thread_counts_ : one_thread;
     const bool with_arg1 = family->arg_count_ >= 1;

     for (auto const& args : family->args_) {
       for (int num_threads : thread_counts) {
         Benchmark::Instance instance;
         instance.function = family->function_;
         instance.has_arg1 = with_arg1;
         instance.arg1 = args.first;
         instance.has_arg2 = family->arg_count_ == 2;
         instance.arg2 = args.second;
         instance.threads = num_threads;
         instance.multithreaded = explicit_threads;

         // Name = family name, then the arguments, then the thread count.
         instance.name = family->name_;
         if (with_arg1) {
           AppendHumanReadable(instance.arg1, &instance.name);
         }
         if (family->arg_count_ >= 2) {
           AppendHumanReadable(instance.arg2, &instance.name);
         }
         if (explicit_threads) {
           instance.name += StringPrintF("/threads:%d", instance.threads);
         }

         benchmarks->push_back(instance);
       }
     }
   }
   return true;
 }


 // Creates a benchmark family named `name` that runs `f`, and registers it
 // with BenchmarkFamilies. arg_count_ starts at -1, meaning no argument
 // method (Arg, Range, ArgPair, ...) has been called yet.
 Benchmark::Benchmark(const char* name,
                      Function* f)
                     : name_(name), function_(f), arg_count_(-1) {
   registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this);
 }

 // Unregisters this family by vacating its slot in BenchmarkFamilies.
 Benchmark::~Benchmark()  {
   BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_);
 }

 // Adds a run with the single argument `x`. May not be mixed with the
 // two-argument methods (CHECKed). Returns this for chaining.
 Benchmark* Benchmark::Arg(int x) {
   CHECK(arg_count_ == -1 || arg_count_ == 1);
   arg_count_ = 1;
   // The second slot is unused for one-argument benchmarks.
   args_.emplace_back(x, -1);
   return this;
 }

 // Adds one single-argument run for every value AddRange() generates between
 // `start` and `limit` using kRangeMultiplier. May not be mixed with the
 // two-argument methods (CHECKed). Returns this for chaining.
 Benchmark* Benchmark::Range(int start, int limit) {
   CHECK(arg_count_ == -1 || arg_count_ == 1);
   arg_count_ = 1;

   std::vector<int> values;
   AddRange(&values, start, limit, kRangeMultiplier);

   // The second slot is unused for one-argument benchmarks.
   for (std::size_t idx = 0; idx < values.size(); ++idx) {
     args_.emplace_back(values[idx], -1);
   }
   return this;
 }

 // Adds one single-argument run for every integer in [start, limit].
 // Requires 0 <= start <= limit (CHECKed). Returns this for chaining.
 // NOTE(review): if limit equals the largest int, `arg++` overflows before
 // the loop condition can become false — confirm callers never pass that.
 Benchmark* Benchmark::DenseRange(int start, int limit) {
   CHECK(arg_count_ == -1 || arg_count_ == 1);
   arg_count_ = 1;
   CHECK_GE(start, 0);
   CHECK_LE(start, limit);
   for (int arg = start; arg <= limit; arg++) {
     args_.emplace_back(arg, -1);
   }
   return this;
 }

 // Adds a run with the argument pair (x, y). May not be mixed with the
 // one-argument methods (CHECKed). Returns this for chaining.
 Benchmark* Benchmark::ArgPair(int x, int y) {
   CHECK(arg_count_ == -1 || arg_count_ == 2);
   arg_count_ = 2;
   args_.emplace_back(x, y);
   return this;
 }

 // Adds a run for every combination of a value from the range [lo1, hi1] and
 // a value from the range [lo2, hi2], both generated by AddRange() with
 // kRangeMultiplier. The first argument varies slowest. May not be mixed with
 // the one-argument methods (CHECKed). Returns this for chaining.
 Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
   CHECK(arg_count_ == -1 || arg_count_ == 2);
   arg_count_ = 2;

   std::vector<int> first_values;
   std::vector<int> second_values;
   AddRange(&first_values, lo1, hi1, kRangeMultiplier);
   AddRange(&second_values, lo2, hi2, kRangeMultiplier);

   for (std::size_t a = 0; a < first_values.size(); ++a) {
     for (std::size_t b = 0; b < second_values.size(); ++b) {
       args_.emplace_back(first_values[a], second_values[b]);
     }
   }
   return this;
 }

 // Invokes `custom_arguments` with this benchmark so the callback can add
 // arguments itself. Returns this for chaining.
 Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
   custom_arguments(this);
   return this;
 }

 // Adds a thread count `t` (must be > 0, CHECKed) to run the benchmark with.
 // Returns this for chaining.
 Benchmark* Benchmark::Threads(int t) {
   CHECK_GT(t, 0);
   thread_counts_.push_back(t);
   return this;
 }

 // Adds the thread counts AddRange() generates between min_threads and
 // max_threads with a multiplier of 2. Requires 0 < min_threads <=
 // max_threads (CHECKed). Returns this for chaining.
 Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
   CHECK_GT(min_threads, 0);
   CHECK_GE(max_threads, min_threads);

   AddRange(&thread_counts_, min_threads, max_threads, 2);
   return this;
 }

 // Adds a thread count equal to NumCPUs(). The CPU count is queried once and
 // cached in a function-local static. Returns this for chaining.
 Benchmark* Benchmark::ThreadPerCpu() {
   static int num_cpus = NumCPUs();
   thread_counts_.push_back(num_cpus);
   return this;
 }

 // Appends to *dst: `lo`, then every power of `mult` strictly between `lo`
 // and `hi` in increasing order, then `hi` (unless it equals `lo`).
 //
 //   dst:  vector to append to; existing contents are kept.
 //   lo:   first value, must be >= 0.
 //   hi:   last value, must be >= lo.
 //   mult: growth factor, must be >= 2. A factor of 1 would never advance
 //         and a factor of 0 would divide by zero, so both are rejected.
 void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
   CHECK_GE(lo, 0);
   CHECK_GE(hi, lo);
   CHECK_GE(mult, 2);

   // Add "lo"
   dst->push_back(lo);

   static const int kint32max = std::numeric_limits<int32_t>::max();

   // Now space out the benchmarks in multiples of "mult"
   for (int32_t i = 1; i < hi;) {
     if (i > lo) {
       dst->push_back(i);
     }
     // Stop before "i * mult" would overflow. Checking after the push (rather
     // than in the loop condition) keeps the largest representable power of
     // "mult" in the output when it lies inside the range.
     if (i > kint32max / mult) break;
     i *= mult;
   }
   // Add "hi" (if different from "lo")
   if (hi != lo) {
     dst->push_back(hi);
   }
 }

 } // end namespace internal

 namespace {


 // Execute one thread of benchmark b for the specified number of iterations.
 // Adds the stats collected for the thread into *total.
 //
 //   b:         benchmark instance whose function is invoked.
 //   iters:     iteration budget handed to the State object.
 //   thread_id: index of this thread, forwarded to the State object.
 //   total:     shared totals; updated while holding GetBenchmarkLock().
 void RunInThread(const benchmark::internal::Benchmark::Instance* b,
                  int iters, int thread_id,
                  ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
   State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id);
   b->function(st);
   // The benchmark function must have consumed its whole iteration budget.
   CHECK(st.iterations() == st.max_iterations) <<
     "Benchmark returned before State::KeepRunning() returned false!";
   {
     MutexLock l(GetBenchmarkLock());
     total->bytes_processed += st.bytes_processed();
     total->items_processed += st.items_processed();
   }

   // Tell the timer manager this thread is done; the last thread to do so
   // triggers the "done" notification RunBenchmark() waits on.
   timer_manager->Finalize();
 }

 // Runs benchmark instance `b` FLAGS_benchmark_repetitions times and passes
 // the collected runs to br->ReportRuns().
 //
 // During the first repetition the iteration count is grown until the run is
 // long enough (or a fixed count was requested through
 // FLAGS_benchmark_iterations); later repetitions reuse that count.
 //
 //   b:  instance to run; b.threads worker threads are used when
 //       b.multithreaded is set, otherwise the calling thread runs it.
 //   br: reporter that receives the results.
 void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
                   const BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
   int iters = FLAGS_benchmark_iterations ? FLAGS_benchmark_iterations
                                          : 1;
   std::vector<BenchmarkReporter::Run> reports;

   std::vector<std::thread> pool;
   if (b.multithreaded)
     pool.resize(b.threads);

   for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
     std::string mem;
     while (true) {
       // Try benchmark
       VLOG(2) << "Running " << b.name << " for " << iters << "\n";

       // Reset the per-run state a benchmark may modify.
       {
         MutexLock l(GetBenchmarkLock());
         GetReportLabel()->clear();
         use_real_time = false;
       }

       Notification done;
       timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));

       ThreadStats total;
       running_benchmark = true;
       if (b.multithreaded) {
         // If this is our first iteration of the while(true) loop then the
         // threads haven't been started and can't be joined. Otherwise we need
         // to join the thread before replacing them.
         for (std::thread& thread : pool) {
           if (thread.joinable())
             thread.join();
         }
         for (std::size_t ti = 0; ti < pool.size(); ++ti) {
             pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
         }
       } else {
         // Run directly in this thread
         RunInThread(&b, iters, 0, &total);
       }
       // Signalled once every thread has called TimerManager::Finalize().
       done.WaitForNotification();
       running_benchmark = false;

       const double cpu_accumulated_time = timer_manager->cpu_time_used();
       const double real_accumulated_time = timer_manager->real_time_used();
       timer_manager.reset();

       VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
               << real_accumulated_time << "\n";

       // Base decisions off of real time if requested by this benchmark.
       double seconds = cpu_accumulated_time;
       std::string label;
       {
         MutexLock l(GetBenchmarkLock());
         label = *GetReportLabel();
         if (use_real_time) {
           seconds = real_accumulated_time;
         }
       }

       // If this was the first run, was elapsed time or cpu time large enough?
       // If this is not the first run, go with the current value of iter.
       if ((i > 0) ||
           (iters == FLAGS_benchmark_iterations) ||
           (iters >= kMaxIterations) ||
           (seconds >= FLAGS_benchmark_min_time) ||
           (real_accumulated_time >= 5*FLAGS_benchmark_min_time)) {
         double bytes_per_second = 0;
         if (total.bytes_processed > 0 && seconds != 0.0) {
           bytes_per_second = (total.bytes_processed / seconds);
         }
         double items_per_second = 0;
         if (total.items_processed > 0 && seconds != 0.0) {
           items_per_second = (total.items_processed / seconds);
         }

         // Create report about this benchmark run.
         BenchmarkReporter::Run report;
         report.benchmark_name = b.name;
         report.report_label = label;
         // Report the total iterations across all threads.
         report.iterations = static_cast<int64_t>(iters) * b.threads;
         report.real_accumulated_time = real_accumulated_time;
         report.cpu_accumulated_time = cpu_accumulated_time;
         report.bytes_per_second = bytes_per_second;
         report.items_per_second = items_per_second;
         reports.push_back(report);
         break;
       }

       // See how much iterations should be increased by
       // Note: Avoid division by zero with max(seconds, 1ns).
       double multiplier = FLAGS_benchmark_min_time * 1.4 / std::max(seconds, 1e-9);
       // If our last run was at least 10% of FLAGS_benchmark_min_time then we
       // use the multiplier directly. Otherwise we use at most 10 times
       // expansion.
       // NOTE: When the last run was at least 10% of the min time the max
       // expansion should be 14x.
       bool is_significant = (seconds / FLAGS_benchmark_min_time) > 0.1;
       multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
       if (multiplier <= 1.0) multiplier = 2.0;
       double next_iters = std::max(multiplier * iters, iters + 1.0);
       if (next_iters > kMaxIterations) {
         next_iters = kMaxIterations;
       }
       VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
       iters = static_cast<int>(next_iters + 0.5);
     }
   }
   br->ReportRuns(reports);
   if (b.multithreaded) {
     // The pool slots are default-constructed and only hold a started thread
     // once a repetition has executed. With zero repetitions nothing was
     // started, and joining a non-joinable std::thread throws, so only join
     // the threads that are actually joinable.
     for (std::thread& thread : pool) {
       if (thread.joinable())
         thread.join();
     }
   }
 }

 }  // namespace

 // Builds the state object handed to a benchmark function.
 //   max_iters: stored in max_iterations; must be non-zero (CHECKed).
 //   has_x, x:  stored in has_range_x_ / range_x_.
 //   has_y, y:  stored in has_range_y_ / range_y_.
 //   thread_i:  stored in thread_index.
 // All counters start at zero and the state starts as not started.
 State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
              int thread_i)
     : started_(false), total_iterations_(0),
       has_range_x_(has_x), range_x_(x),
       has_range_y_(has_y), range_y_(y),
       bytes_processed_(0), items_processed_(0),
       thread_index(thread_i),
       max_iterations(max_iters)
 {
     CHECK(max_iterations != 0) << "At least one iteration must be run";
 }

 // Stops the shared timer for the current run. Only valid while a benchmark
 // is running (CHECKed). TimerManager::StopTimer() acts as a barrier, so this
 // blocks until every thread of the run has called it.
 void State::PauseTiming() {
   // Add in time accumulated so far
   CHECK(running_benchmark);
   timer_manager->StopTimer();
 }

 // Starts the shared timer for the current run. Only valid while a benchmark
 // is running (CHECKed). TimerManager::StartTimer() acts as a barrier, so
 // this blocks until every thread of the run has called it.
 void State::ResumeTiming() {
   CHECK(running_benchmark);
   timer_manager->StartTimer();
 }

 // Requests that RunBenchmark() bases its decisions for the current run on
 // real time instead of cpu time. The flag is reset before every run.
 void State::UseRealTime() {
   MutexLock l(GetBenchmarkLock());
   use_real_time = true;
 }

 // Replaces the shared report label with `label`; RunBenchmark() copies it
 // into the run's report. Only valid while a benchmark is running (CHECKed).
 void State::SetLabel(const char* label) {
   CHECK(running_benchmark);
   MutexLock l(GetBenchmarkLock());
   *GetReportLabel() = label;
 }

 // Out-of-line definition of the (empty) reporter destructor.
 BenchmarkReporter::~BenchmarkReporter() {}

 namespace internal {

 // Prints the report header to stdout: CPU summary, current local time, an
 // optional CPU-scaling warning, the build type (debug builds only) and the
 // column headings followed by a separator line.
 //
 // Also records context.name_field_width for later PrintRunData() calls.
 // Always returns true.
 bool ConsoleReporter::ReportContext(const Context& context) const {
   name_field_width_ = context.name_field_width;

   fprintf(stdout,
           "Run on (%d X %0.0f MHz CPU%s)\n",
           context.num_cpus,
           context.mhz_per_cpu,
           (context.num_cpus > 1) ? "s" : "");

   // remainder_us is an output of walltime::Print that is not used here.
   int remainder_us;
   std::string walltime_str = walltime::Print(
                                 walltime::Now(), "%Y/%m/%d-%H:%M:%S",
                                 true,  // use local timezone
                                 &remainder_us);
   fprintf(stdout, "%s\n", walltime_str.c_str());

   if (context.cpu_scaling_enabled) {
     fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark "
                     "timings may be noisy\n");
   }

 #ifndef NDEBUG
   fprintf(stdout, "Build Type: DEBUG\n");
 #endif

   int output_width =
       fprintf(stdout,
               "%-*s %10s %10s %10s\n",
               static_cast<int>(name_field_width_),
               "Benchmark",
               "Time(ns)", "CPU(ns)",
               "Iterations");
   // fprintf returns a negative value on failure. Only draw the separator
   // when the heading was actually written; otherwise "output_width - 1"
   // would convert to an enormous string length.
   if (output_width > 0) {
     // The separator is as wide as the heading without its newline.
     fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str());
   }

   return true;
 }

 void ConsoleReporter::ReportRuns(
     const std::vector<Run>& reports) const {
   if (reports.empty()) {
     return;
   }

   for (Run const& run : reports) {
     CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
     PrintRunData(run);
   }

   if (reports.size() < 2) {
     // We don't report aggregated data if there was a single run.
     return;
   }

   Run mean_data;
   Run stddev_data;
   ComputeStats(reports, &mean_data, &stddev_data);

   // Output using PrintRun.
   PrintRunData(mean_data);
   PrintRunData(stddev_data);
   fprintf(stdout, "\n");
 }

 // Prints one result line: benchmark name, real and CPU time in nanoseconds,
 // iteration count, optional byte/item rates and the report label.
 //
 // Times are printed per iteration, except when result.iterations is 0 (as
 // in the standard-deviation row), where they are printed as stored.
 void ConsoleReporter::PrintRunData(const Run& result) const {
   // Format bytes per second
   std::string rate;
   if (result.bytes_per_second > 0) {
     rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
   }

   // Format items per second
   std::string items;
   if (result.items_per_second > 0) {
     items = StrCat(" ", HumanReadableNumber(result.items_per_second),
                    " items/s");
   }

   double const multiplier = 1e9; // nano second multiplier
   // A '*' field width consumes an int argument, so cast explicitly (as
   // ReportContext() does) instead of passing the stored width as-is.
   ColorPrintf(COLOR_GREEN, "%-*s ",
               static_cast<int>(name_field_width_),
               result.benchmark_name.c_str());
   if (result.iterations == 0) {
     ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
                 result.real_accumulated_time * multiplier,
                 result.cpu_accumulated_time * multiplier);
   } else {
     ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
                 (result.real_accumulated_time * multiplier) /
                     (static_cast<double>(result.iterations)),
                 (result.cpu_accumulated_time * multiplier) /
                     (static_cast<double>(result.iterations)));
   }
   // "%lld" consumes a long long, which need not be the type of iterations.
   ColorPrintf(COLOR_CYAN, "%10lld",
               static_cast<long long>(result.iterations));
   ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
               13, rate.c_str(),
               18, items.c_str(),
               result.report_label.c_str());
 }

 void RunMatchingBenchmarks(const std::string& spec,
                            const BenchmarkReporter* reporter) {
   CHECK(reporter != nullptr);
   if (spec.empty()) return;

   std::vector<benchmark::internal::Benchmark::Instance> benchmarks;
   auto families = benchmark::internal::BenchmarkFamilies::GetInstance();
   if (!families->FindBenchmarks(spec, &benchmarks)) return;


   // Determine the width of the name field using a minimum width of 10.
   // Also determine max number of threads needed.
   size_t name_field_width = 10;
   for (const internal::Benchmark::Instance& benchmark : benchmarks) {
     // Add width for _stddev and threads:XX
     if (benchmark.threads > 1 && FLAGS_benchmark_repetitions > 1) {
       name_field_width =
           std::max<size_t>(name_field_width, benchmark.name.size() + 17);
     } else if (benchmark.threads > 1) {
       name_field_width =
           std::max<size_t>(name_field_width, benchmark.name.size() + 10);
     } else if (FLAGS_benchmark_repetitions > 1) {
       name_field_width =
           std::max<size_t>(name_field_width, benchmark.name.size() + 7);
     } else {
       name_field_width =
           std::max<size_t>(name_field_width, benchmark.name.size());
     }
   }

   // Print header here
   BenchmarkReporter::Context context;
   context.num_cpus = NumCPUs();
   context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;

   context.cpu_scaling_enabled = CpuScalingEnabled();
   context.name_field_width = name_field_width;

   if (reporter->ReportContext(context)) {
     for (const auto& benchmark : benchmarks) {
       RunBenchmark(benchmark, reporter);
     }
   }
 }

 } // end namespace internal

 // Runs the benchmarks selected by FLAGS_benchmark_filter. Passing nullptr
 // makes the overload below fall back to its ConsoleReporter.
 void RunSpecifiedBenchmarks() {
   RunSpecifiedBenchmarks(nullptr);
 }

 // Runs the benchmarks selected by FLAGS_benchmark_filter and reports them
 // through `reporter`, or through a ConsoleReporter when `reporter` is null.
 // An empty filter and the string "all" both select every benchmark.
 void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
   internal::ConsoleReporter default_reporter;
   internal::RunMatchingBenchmarks(spec, reporter ? reporter : &default_reporter);
 }

 namespace internal {

 // Prints the list of supported command-line flags to stdout and terminates
 // the process with exit status 0.
 void PrintUsageAndExit() {
   fprintf(stdout,
           "benchmark"
           " [--benchmark_filter=<regex>]\n"
           "          [--benchmark_iterations=<iterations>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
           "          [--color_print={true|false}]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
 }

 // Parses the benchmark flags out of argv, storing their values in the
 // FLAGS_* variables. Every recognized flag is removed from argv and *argc is
 // decremented accordingly; unrecognized arguments are left in place.
 // A "help" flag prints the usage text and exits the process.
 void ParseCommandLineFlags(int* argc, const char** argv) {
   using namespace benchmark;
   // argv[0] is skipped.
   for (int i = 1; i < *argc; ++i) {
     if (
         ParseStringFlag(argv[i], "benchmark_filter",
                         &FLAGS_benchmark_filter) ||
         ParseInt32Flag(argv[i], "benchmark_iterations",
                        &FLAGS_benchmark_iterations) ||
         ParseDoubleFlag(argv[i], "benchmark_min_time",
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
         ParseBoolFlag(argv[i], "color_print",
                        &FLAGS_color_print) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
       // Shift the remaining arguments down by one to drop argv[i].
       // NOTE(review): the last step copies argv[*argc], so this relies on
       // the array having a readable entry past the last argument (as the
       // argv of main() does) — confirm for other callers.
       for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];

       --(*argc);
       // Re-examine index i, which now holds the next argument.
       --i;
     } else if (IsFlag(argv[i], "help")) {
       PrintUsageAndExit();
     }
   }
 }

 } // end namespace internal

 // Initializes the library: parses (and strips) the benchmark flags from the
 // command line, applies the verbosity level, and warms up the CPU-usage and
 // wall-clock helpers so their first-call work does not happen during a
 // benchmark.
 void Initialize(int* argc, const char** argv) {
   internal::ParseCommandLineFlags(argc, argv);
   internal::SetLogLevel(FLAGS_v);
   // TODO remove this. It prints some output the first time it is called.
   // We don't want to have this output printed during benchmarking.
   MyCPUUsage();
   // The first call to walltime::Now initialized it. Call it once to
   // prevent the initialization from happening in a benchmark.
   walltime::Now();
 }

 } // end namespace benchmark