Dump data for DocNN visualization

Summary:
- Dump instance activations and per-neuron statistics for the model introspection visualization in FBLearner Flow (see the usage sketch below)
- This is part of Minsuk's summer intern project; see the following link for high-level details: https://www.dropbox.com/s/m89rwpoomqkc9jb/aml-talk-nnvis-minsuk.pptx?dl=0
- It will combine the following two visualizations: https://our.intern.facebook.com/intern/fblearner/c2graphvis/13795371/ and https://our.intern.facebook.com/intern/fblearner/model-introspection-nn/11910201/
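
Example usage (a minimal sketch; the evaluation loop, blob layout, and names
such as first_batch_outputs and predictions below are illustrative, not part
of this diff):

    from caffe2.python import introspect_vis

    vis = introspect_vis.IntrospectVisData(
        model_name='DocNN',
        first_outputs=first_batch_outputs,  # outputs of the first evaluated instance
        meta_info={'output_names': ['fc', 'softmax'], 'multi_label': False},
        lab_arr=['class_a', 'class_b'],
    )
    for idx, (labels, scores, outputs) in enumerate(predictions):
        vis.processInstance(idx, labels, scores, outputs, model_specific={})
    vis.postprocess('/tmp/introspect_vis.json')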

Differential Revision: D4303679

fbshipit-source-id: eeac699891b17cea0b29324d584937460a8d7a25
diff --git a/caffe2/python/introspect_vis.py b/caffe2/python/introspect_vis.py
new file mode 100644
index 0000000..68b3668
--- /dev/null
+++ b/caffe2/python/introspect_vis.py
@@ -0,0 +1,104 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import numpy as np
+import json
+
+
+class IntrospectVisData(object):
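+    """Collects per-instance activations and per-class neuron statistics for a
+    model and dumps them as JSON for the introspection visualization."""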
+
+    def __init__(self, model_name, first_outputs, meta_info, lab_arr):
+        self.model_name = model_name
+        self.max_num_instances = 1000
+        self.count = 0
+        self.instances = []
+        self.labels = [{"id": i, "name": c} for i, c in enumerate(lab_arr)]
+        self.is_multilabel = bool(meta_info.get('multi_label', False))
+
+        n_outputs = len(meta_info['output_names'])
+        self.neuron_groups = [
+            {"idx": i, "name": name,
+             # neuron count taken from the first instance's output blob
+             "size": len(first_outputs[2][i - n_outputs][0])}
+            for i, name in enumerate(meta_info['output_names'])]
+        # running per-class activation sums for each neuron group; a list
+        # (not a map object) so it can be indexed and len()-ed on Python 3
+        self.summaries = [np.zeros((len(self.labels), g['size']))
+                          for g in self.neuron_groups]
+
+    def getInstanceActivations(self, outputs):
+        # the last len(neuron_groups) outputs hold the activations; keep the
+        # first batch element of each, rounded to keep the JSON small
+        outputs = outputs[-len(self.neuron_groups):]
+        return [[round(_val, 4) for _val in out[0]] for out in outputs]
+
+    def updateNeuronSummaries(self, activations, true_idxs):
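+        # accumulate this instance's activations into the running sum for each
+        # true class; the sums are averaged by self.count in postprocess()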
+        self.count += 1
+        for out_idx in range(len(self.summaries)):
+            if self.is_multilabel:
+                for true_idx in true_idxs:
+                    self.summaries[out_idx][true_idx] += activations[out_idx]
+            else:
+                self.summaries[out_idx][true_idxs] += activations[out_idx]
+
+    def appendInstance(self, instance):
+        self.instances.append(instance)
+
+    def processInstance(self, idx, labels, scores, outputs, model_specific):
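+        # For supported models (currently only DocNN), compute the instance's
+        # activations and fold them into the per-class neuron summaries; keep
+        # up to max_num_instances individual instances for the visualization.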
+        activations = []
+        if self.model_name in ['DocNN']:
+            activations = self.getInstanceActivations(outputs)
+            self.updateNeuronSummaries(activations, labels)
+        if idx < self.max_num_instances:
+            if len(activations) == 0:
+                activations = self.getInstanceActivations(outputs)
+            instance = {
+                "id": idx,
+                "labels": labels,
+                "scores": scores,
+                "activations": activations,
+            }
+            for key, val in model_specific.items():
+                instance[key] = val
+            self.appendInstance(instance)
+
+    def updateArrangements(self):
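+        # Sort each instance's class scores and assign it a per-class
+        # 'position' (rank by how confidently it was classified) for layout.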
+        if self.model_name in ['DocNN']:
+            # sort class scores based on score values
+            for instance in self.instances:
+                instance['scores'] =\
+                    sorted([{"class_id": j, "score": round(_s, 3)}
+                            for j, _s in enumerate(instance['scores'])],
+                           key=lambda x: x['score'], reverse=True)
+            # instance positions based on scores
+            inst_sort_vals = [[] for _ in range(len(self.labels))]
+            for i, x in enumerate(self.instances):
+                sort_val = 1.0
+                # if multi_label, get the first label
+                label = (x['labels'] if isinstance(x['labels'], int)
+                         else x['labels'][0])
+                if label == x['scores'][0]['class_id']:
+                    # How much score difference from that of rank 2 class
+                    sort_val = x['scores'][0]['score'] - x['scores'][1]['score']
+                else:
+                    # How far the true class's score falls below the rank 1 class
+                    true_score = next(s['score'] for s in x['scores']
+                                      if s['class_id'] == label)
+                    sort_val = true_score - x['scores'][0]['score']
+                inst_sort_vals[label].append({"inst_id": i, "val": sort_val})
+            for class_id, inst_vals in enumerate(inst_sort_vals):
+                for i, r in enumerate(sorted(inst_vals, key=lambda x: x['val'],
+                                             reverse=True)):
+                    self.instances[r['inst_id']]['position'] = i
+
+    def postprocess(self, filepath):
+        # average the accumulated sums; after the swap, rows = neurons, cols = classes
+        self.neuron_summaries = [
+            np.around(np.swapaxes(_s, 0, 1) / float(self.count), 4).tolist()
+            for _s in self.summaries] if self.count > 0 else None
+
+        self.updateArrangements()
+
+        with open(filepath, 'w') as vf:
+            json.dump({
+                "model_type": self.model_name,
+                "neuron_groups": self.neuron_groups,
+                "classes": self.labels,
+                "instances": self.instances,
+                "neuron_summaries": self.neuron_summaries,
+            }, vf)