col max hist observer

Summary:
Add OutputColumnMaxHistogramNetObserver and OutputColumnMaxHistogramObserver to dnnlowp observers.

Sample output histogram at /mnt/public/amyyang/test/col_max_test.log (generated for ctr_web_feed)
```
columns:
        "op_index",
        "output_idx",
        "blob_name",
        "col_idx",
        "min",
        "max",
        "nbins"
```

Test Plan: Tested with ctr_web_feed

Reviewed By: csummersea

Differential Revision: D18194229

fbshipit-source-id: 1402fcdc174a1f52744c850f5e2cc3bdc73c3a45
diff --git a/caffe2/quantization/server/activation_distribution_observer.cc b/caffe2/quantization/server/activation_distribution_observer.cc
index da43212..5d9afc4 100644
--- a/caffe2/quantization/server/activation_distribution_observer.cc
+++ b/caffe2/quantization/server/activation_distribution_observer.cc
@@ -43,6 +43,40 @@
   fbgemm::FindMinMax(temp.data(), min, max, len);
 }
 
+// Returns the tensor's values as floats, or nullptr when the tensor has no
+// data or holds a type other than float, int or long. A float tensor is
+// returned in place. int and long tensors are converted into a thread-local
+// buffer, so that result is only valid until this thread's next call. (The
+// buffer used to be a function-local vector that was destroyed on return,
+// which handed the caller a dangling pointer.)
+float* GetFloatTensorData(TensorCPU* tensor) {
+  thread_local vector<float> converted;
+  if (tensor->IsType<float>()) {
+    // Already float: hand out the tensor's own pointer, which may be null.
+    return tensor->template data<float>();
+  }
+  if (tensor->IsType<int>()) {
+    const int* data_orig = tensor->data<int>();
+    if (!data_orig) {
+      return nullptr;
+    }
+    // assign() converts element by element, like the former copy loop.
+    converted.assign(data_orig, data_orig + tensor->numel());
+    return converted.data();
+  }
+  if (tensor->IsType<long>()) {
+    const long* data_orig = tensor->data<long>();
+    if (!data_orig) {
+      return nullptr;
+    }
+    // Values beyond float's 24-bit mantissa are rounded by the conversion.
+    converted.assign(data_orig, data_orig + tensor->numel());
+    return converted.data();
+  }
+  // Any other element type is not supported.
+  return nullptr;
+}
+
 template <>
 void FindMinMax<float>(const float* data, float* min, float* max, int len) {
   fbgemm::FindMinMax(data, min, max, len);
@@ -281,6 +315,71 @@
   return;
 }
 
+// Locates col_max_blob_name among op's outputs and remembers its index so
+// Stop() can fetch that output; enforces that the blob is an output of op.
+OutputColumnMaxHistogramObserver::OutputColumnMaxHistogramObserver(
+    OperatorBase* op,
+    const std::string& col_max_blob_name,
+    int nbins,
+    std::shared_ptr<HistogramObserver::Info> info)
+    : ObserverBase<OperatorBase>(op),
+      col_max_blob_name_(col_max_blob_name),
+      nbins_(nbins),
+      info_(info) {
+  const auto& outputs = op->debug_def().output();
+  auto it = std::find(outputs.begin(), outputs.end(), col_max_blob_name);
+  CAFFE_ENFORCE(it != outputs.end(), "Cannot find blob in operator output.");
+  col_max_blob_idx_ = std::distance(outputs.begin(), it);
+}
+
+// Adds the largest absolute value of every column of the observed 2-D CPU
+// output to that column's histograms; unusable tensors are skipped.
+void OutputColumnMaxHistogramObserver::Stop() {
+  if (!subject_->OutputIsTensorType(col_max_blob_idx_, CPU)) {
+    return;
+  }
+  Tensor* tensor = subject_->template Output<Tensor>(col_max_blob_idx_, CPU);
+  if (tensor->numel() == 0 || tensor->numel() == -1) {
+    return;
+  }
+  float* data = GetFloatTensorData(tensor);
+  if (data == nullptr) {
+    // Bail out on every miss; returning only once let later runs hit nullptr.
+    if (!warning_printed_) {
+      LOG(INFO) << "Tensor " << col_max_blob_name_
+                << " has mismatching type, or unsupported type "
+                << tensor->meta().name() << " with size " << tensor->numel();
+      warning_printed_ = true;
+    }
+    return;
+  }
+  CAFFE_ENFORCE(
+      tensor->dim() == 2,
+      "Tensor " + col_max_blob_name_ +
+          " is not two-dimensional. Tensor.dim() = " +
+          caffe2::to_string(tensor->dim()));
+  const int num_rows = tensor->size_to_dim(1);
+  const int num_columns = tensor->size_from_dim(1);
+  if (num_columns_ == -1) {
+    num_columns_ = num_columns; // the first run fixes the expected width
+  }
+  CAFFE_ENFORCE(
+      num_columns_ == num_columns, "Observed inconsistent number of columns.");
+  for (int col = 0; col < num_columns; col++) {
+    auto col_max = std::abs(data[col]); // row 0 seeds the running maximum
+    for (int r = 1; r < num_rows; r++) {
+      col_max = std::max(col_max, std::abs(data[r * num_columns + col]));
+    }
+    if (info_->histograms.size() <= static_cast<size_t>(col)) {
+      info_->histograms.emplace_back(nbins_);
+      info_->total_histograms.emplace_back(nbins_);
+      info_->min_max_info.tensor_infos.emplace_back(col_max_blob_name_);
+    }
+    info_->histograms[col].Add(col_max);
+    info_->total_histograms[col].Add(col_max);
+  }
+}
+
 HistogramNetObserver::HistogramNetObserver(
     NetBase* subject,
     const string& out_file_name,
@@ -407,6 +506,136 @@
   return HasDNNLowPEngine_(op.debug_def());
 }
 
+// Attaches an OutputColumnMaxHistogramObserver to every operator output whose
+// blob name was requested, and records the shared histogram Info under
+// {operator position, output position} for later dumping.
+OutputColumnMaxHistogramNetObserver::OutputColumnMaxHistogramNetObserver(
+    NetBase* subject,
+    const std::string& out_file_name,
+    const std::vector<std::string>& observe_column_max_for_blobs,
+    int nbins,
+    int dump_freq,
+    bool mul_nets)
+    : NetObserver(subject),
+      dump_freq_(dump_freq),
+      cnt_(0),
+      mul_nets_(mul_nets),
+      out_file_name_(out_file_name) {
+  if (observe_column_max_for_blobs.empty()) {
+    return;
+  }
+  col_max_blob_names_.insert(
+      observe_column_max_for_blobs.begin(), observe_column_max_for_blobs.end());
+  // Positions count every operator and every output, observed or not.
+  int op_position = -1;
+  for (auto* op : subject->GetOperators()) {
+    ++op_position;
+    std::unordered_map<int, std::shared_ptr<HistogramObserver::Info>>
+        infos_by_output;
+    int output_position = -1;
+    for (const auto& blob : op->debug_def().output()) {
+      ++output_position;
+      if (col_max_blob_names_.count(blob) == 0) {
+        continue; // not one of the requested blobs
+      }
+      // The Info starts without histograms; the observer appends one per
+      // column at run time, when the tensor's width is known.
+      auto info = std::make_shared<HistogramObserver::Info>();
+      info->min_max_info.type = op->debug_def().type();
+      op->AttachObserver(unique_ptr<OutputColumnMaxHistogramObserver>(
+          new OutputColumnMaxHistogramObserver(op, blob, nbins, info)));
+      infos_by_output[output_position] = info;
+    }
+    // Operators without an observed output get no entry.
+    if (!infos_by_output.empty()) {
+      hist_infos_[op_position] = infos_by_output;
+    }
+  }
+}
+
+// Writes one line per observed column ("op_idx output_idx blob_name col min
+// max nbins" followed by the bin counts) to out_file_name, which gets a
+// ".<this>" suffix when mul_nets_ is set. With print_total_min_max the
+// accumulated histograms are dumped and also logged; otherwise the per-dump
+// histograms are dumped and then replaced by fresh ones of the same size.
+void OutputColumnMaxHistogramNetObserver::DumpAndReset_(
+    const std::string& out_file_name,
+    bool print_total_min_max) {
+  stringstream file_name;
+  file_name << out_file_name;
+  if (mul_nets_) {
+    file_name << "." << this;
+  }
+  ofstream f(file_name.str());
+  if (!f) {
+    LOG(WARNING) << this << ": can't open " << file_name.str();
+  }
+  for (const auto& it : hist_infos_) {
+    // Iterate the inner map in place; it used to be copied on every dump.
+    for (const auto& output_idx_hist : it.second) {
+      const int output_idx = output_idx_hist.first;
+      HistogramObserver::Info* info = output_idx_hist.second.get();
+      if (!info) {
+        continue;
+      }
+      for (size_t i = 0; i < info->histograms.size(); ++i) {
+        const auto& blob_name = info->min_max_info.tensor_infos[i].name;
+        const Histogram* hist =
+            (print_total_min_max ? info->total_histograms : info->histograms)[i]
+                .Finalize();
+        if (hist->Min() >= hist->Max()) {
+          LOG(WARNING) << "Histogram of " << blob_name
+                       << " has an empty range: min " << hist->Min()
+                       << " and max " << hist->Max();
+        }
+        if (hist->GetHistogram()->empty()) {
+          LOG(WARNING) << "Histogram of " << blob_name << " is empty";
+        }
+        ostringstream ost;
+        // op_idx, output_idx, blob_name, col, min, max, nbins, then the bins
+        ost << it.first << " " << output_idx << " " << blob_name << " " << i
+            << " " << hist->Min() << " " << hist->Max() << " "
+            << hist->GetHistogram()->size();
+        for (uint64_t c : *hist->GetHistogram()) {
+          ost << " " << c;
+        }
+        if (print_total_min_max) {
+          LOG(INFO) << this << " " << ost.str();
+        }
+        f << ost.str() << endl;
+        if (!print_total_min_max) {
+          // hist was obtained from histograms[i]; do not use it after this.
+          info->histograms[i] = DynamicHistogram(hist->GetHistogram()->size());
+        }
+      }
+    }
+  }
+  f.close();
+}
+
+// Every dump_freq_-th run, dumps and resets the per-dump histograms into a file
+// named out_file_name_ with "_<cnt_ / dump_freq_>" before its extension, if any.
+void OutputColumnMaxHistogramNetObserver::Stop() {
+  ++cnt_;
+  // Non-positive frequencies disable dumping; 0 used to divide by zero here.
+  if (dump_freq_ <= 0 || (cnt_ % dump_freq_) != 0) {
+    return;
+  }
+  const size_t last_dot = out_file_name_.rfind('.');
+  const size_t last_slash = out_file_name_.rfind('/');
+  const bool has_ext = last_dot != string::npos &&
+      (last_slash == string::npos || last_slash < last_dot);
+  const size_t stem_len = has_ext ? last_dot : out_file_name_.size();
+  ostringstream ost;
+  ost << out_file_name_.substr(0, stem_len) << "_" << cnt_ / dump_freq_
+      << out_file_name_.substr(stem_len);
+  DumpAndReset_(ost.str());
+}
+
+OutputColumnMaxHistogramNetObserver::~OutputColumnMaxHistogramNetObserver() {
+  DumpAndReset_(out_file_name_, true); // true: dump the accumulated totals
+}
+
 RegisterQuantizationParamsNetObserver::RegisterQuantizationParamsNetObserver(
     NetBase* subject,
     const string& min_max_file_name,
diff --git a/caffe2/quantization/server/activation_distribution_observer.h b/caffe2/quantization/server/activation_distribution_observer.h
index 72ecab3..56deb6e 100644
--- a/caffe2/quantization/server/activation_distribution_observer.h
+++ b/caffe2/quantization/server/activation_distribution_observer.h
@@ -98,6 +98,29 @@
   bool warning_printed_ = false;
 }; // class HistogramObserver
 
+/**
+ * For one 2-D output blob of an operator, collects per column a histogram of
+ * the largest absolute value found in that column whenever Stop() runs. */
+class OutputColumnMaxHistogramObserver final
+    : public ObserverBase<OperatorBase> {
+ public:
+  explicit OutputColumnMaxHistogramObserver(
+      OperatorBase* op, // operator whose output is observed
+      const std::string& col_max_blob_name, // must be one of op's outputs
+      int nbins, // bin count used for each column's histogram
+      std::shared_ptr<HistogramObserver::Info> info); // receives the histograms
+
+ private:
+  void Stop() override; // folds the current output into the histograms
+
+  std::string col_max_blob_name_; // name of the observed output blob
+  int nbins_;
+  std::shared_ptr<HistogramObserver::Info> info_; // one histogram per column
+  bool warning_printed_ = false; // unsupported-type message is logged once
+  int col_max_blob_idx_ = -1; // index of the blob among the op's outputs
+  int num_columns_ = -1; // column count of the first tensor seen, -1 before
+}; // class OutputColumnMaxHistogramObserver
+
 class HistogramNetObserver final : public NetObserver {
  public:
   /**
@@ -131,6 +154,34 @@
   std::vector<std::shared_ptr<HistogramObserver::Info>> hist_infos_;
 };
 
+class OutputColumnMaxHistogramNetObserver final : public NetObserver {
+ public:
+  explicit OutputColumnMaxHistogramNetObserver( // attaches the op observers
+      NetBase* subject, // net whose operators are scanned for the blobs
+      const std::string& out_file_name, // base name of the dump files
+      const std::vector<std::string>& observe_column_max_for_blobs,
+      int nbins, // bin count of every column histogram
+      int dump_freq = -1, // dump every dump_freq-th Stop(); -1 turns it off
+      bool mul_nets = false); // adds ".<this>" to the dump file name
+  ~OutputColumnMaxHistogramNetObserver(); // dumps the accumulated histograms
+
+ private:
+  void Stop() override; // counts the call and dumps when one is due
+  void DumpAndReset_(
+      const std::string& out_file_name,
+      bool print_total_min_max = false); // true: dump totals and log them
+  int dump_freq_, cnt_; // cnt_ is the number of Stop() calls so far
+  bool mul_nets_;
+  const std::string out_file_name_;
+  std::unordered_set<std::string> col_max_blob_names_; // blobs to observe
+
+  // {operator position in net: {output position: per-column histograms}}
+  std::unordered_map<
+      int,
+      std::unordered_map<int, std::shared_ptr<HistogramObserver::Info>>>
+      hist_infos_;
+};
+
 /**
  * Set quantization parameters of operators based on min/max
  * collected from OutputMinMaxObserver
diff --git a/caffe2/quantization/server/observer_test.py b/caffe2/quantization/server/observer_test.py
index be8770e..7fd6301 100644
--- a/caffe2/quantization/server/observer_test.py
+++ b/caffe2/quantization/server/observer_test.py
@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 import numpy as np
 from caffe2.python import core, workspace
 from caffe2.quantization.server import dnnlowp_pybind11
 
+
 net = core.Net("test_net")
 
 X = np.array([[1, 2], [3, 4]]).astype(np.float32)
@@ -32,3 +31,7 @@
 dnnlowp_pybind11.ObserveHistogramOfOutput("test_net.hist", 1)
 workspace.CreateNet(net)
 workspace.RunNet(net)
+
+dnnlowp_pybind11.AddOutputColumnMaxHistogramObserver(
+    net._net.name, "test_net._col_max_hist", ["X", "W"]  # net, dump file, blobs
+)  # NOTE(review): registered after RunNet(net) above; confirm a later run uses it
diff --git a/caffe2/quantization/server/pybind.cc b/caffe2/quantization/server/pybind.cc
index 3c2cad0..9c4cce8 100644
--- a/caffe2/quantization/server/pybind.cc
+++ b/caffe2/quantization/server/pybind.cc
@@ -69,6 +69,40 @@
       pybind11::arg("mul_nets") = false);
 
   m.def(
+      "AddOutputColumnMaxHistogramObserver",
+      [](const string& net_name,
+         const string& out_file_name,
+         const std::vector<std::string>& observe_column_max_for_blobs,
+         int dump_freq,
+         bool mul_nets) {
+        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(gWorkspace);
+        NetBase* net = gWorkspace->GetNet(net_name);
+        CAFFE_ENFORCE(net, "Can't find net ", net_name);
+        const Observable<NetBase>::Observer* observer = nullptr;
+        {
+          // Drop the GIL only while attaching; pybind11::cast below creates a
+          // Python object and therefore has to run with the GIL held again.
+          pybind11::gil_scoped_release g;
+          observer = net->AttachObserver(
+              make_unique<OutputColumnMaxHistogramNetObserver>(
+                  net,
+                  out_file_name,
+                  observe_column_max_for_blobs,
+                  2048, // nbins of every column histogram
+                  dump_freq,
+                  mul_nets));
+        }
+        CAFFE_ENFORCE(observer != nullptr);
+        return pybind11::cast(observer);
+      },
+      pybind11::arg("net_name"),
+      pybind11::arg("out_file_name"),
+      pybind11::arg("observe_column_max_for_blobs"),
+      pybind11::arg("dump_freq") = -1,
+      pybind11::arg("mul_nets") = false);
+
+  m.def(
       "ChooseQuantizationParams",
       [](const std::string& blob_name) {
         Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();