GPU support for RecurrentOp + Char RNN example

Summary: With a batch size of 32 and otherwise default parameters, I get 70 iterations per second versus 40 on CPU. Batching still doesn't produce a good loss; I am going to address that in a separate diff.

Reviewed By: urikz

Differential Revision: D4516566

fbshipit-source-id: d0611534747beb2cd935a8607a283369378e4a6c
diff --git a/caffe2/operators/recurrent_network_op.cc b/caffe2/operators/recurrent_network_op.cc
index d142a4a..9ca3598 100644
--- a/caffe2/operators/recurrent_network_op.cc
+++ b/caffe2/operators/recurrent_network_op.cc
@@ -75,4 +75,34 @@
 
 REGISTER_GRADIENT(RecurrentNetwork, GetRecurrentNetworkGradient);
 }
+
+namespace detail {
+void extractLinks(
+    OperatorBase* op,
+    const std::string& internalArg,
+    const std::string& externalArg,
+    const std::string offsetArg,
+    std::vector<detail::Link>* links) {
+  const auto& internal = op->GetRepeatedArgument<std::string>(internalArg);
+  const auto& external = op->GetRepeatedArgument<std::string>(externalArg);
+  const auto& offset = op->GetRepeatedArgument<int32_t>(offsetArg);
+  CAFFE_ENFORCE(
+      internal.size() == offset.size(),
+      "internal/offset mismatch: ",
+      internalArg,
+      externalArg);
+  CAFFE_ENFORCE(
+      external.size() == offset.size(),
+      "external/offset mismatch",
+      externalArg,
+      offsetArg);
+  for (auto i = 0; i < internal.size(); ++i) {
+    detail::Link l;
+    l.internal = internal[i];
+    l.external = external[i];
+    l.offset = offset[i];
+    links->push_back(l);
+  }
+}
+} // namespace detail
 }
diff --git a/caffe2/operators/recurrent_network_op.h b/caffe2/operators/recurrent_network_op.h
index 07d8f81..bf29d39 100644
--- a/caffe2/operators/recurrent_network_op.h
+++ b/caffe2/operators/recurrent_network_op.h
@@ -139,28 +139,7 @@
     const std::string& internalArg,
     const std::string& externalArg,
     const std::string offsetArg,
-    std::vector<detail::Link>* links) {
-  const auto& internal = op->GetRepeatedArgument<std::string>(internalArg);
-  const auto& external = op->GetRepeatedArgument<std::string>(externalArg);
-  const auto& offset = op->GetRepeatedArgument<int32_t>(offsetArg);
-  CAFFE_ENFORCE(
-      internal.size() == offset.size(),
-      "internal/offset mismatch: ",
-      internalArg,
-      externalArg);
-  CAFFE_ENFORCE(
-      external.size() == offset.size(),
-      "external/offset mismatch",
-      externalArg,
-      offsetArg);
-  for (auto i = 0; i < internal.size(); ++i) {
-    detail::Link l;
-    l.internal = internal[i];
-    l.external = external[i];
-    l.offset = offset[i];
-    links->push_back(l);
-  }
-}
+    std::vector<detail::Link>* links);
 } // namespace detail
 
 template <typename T, class Context>
diff --git a/caffe2/operators/recurrent_network_op_gpu.cc b/caffe2/operators/recurrent_network_op_gpu.cc
new file mode 100644
index 0000000..16ae8c3
--- /dev/null
+++ b/caffe2/operators/recurrent_network_op_gpu.cc
@@ -0,0 +1,13 @@
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/operators/recurrent_network_op.h"
+
+namespace caffe2 {
+namespace {
+REGISTER_CUDA_OPERATOR(
+    RecurrentNetwork,
+    RecurrentNetworkOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+    RecurrentNetworkGradient,
+    RecurrentNetworkGradientOp<float, CUDAContext>);
+}
+}
diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py
index 1f97a7c..3c091b1 100644
--- a/caffe2/python/examples/char_rnn.py
+++ b/caffe2/python/examples/char_rnn.py
@@ -5,10 +5,13 @@
 
 from caffe2.python import core, workspace, cnn, utils
 from caffe2.python.recurrent import LSTM
+from caffe2.proto import caffe2_pb2
+
 
 import argparse
 import logging
 import numpy as np
+from datetime import datetime
 
 
 logging.basicConfig()
@@ -26,11 +29,6 @@
 
 
 class CharRNN(object):
-    seq_length = 25
-    batch_size = 1
-    iters_to_report = 500
-    hidden_size = 100
-
     def __init__(self, args):
         self.seq_length = args.seq_length
         self.batch_size = args.batch_size
@@ -79,12 +77,12 @@
         self.forward_net = core.Net(model.net.Proto())
 
         xent = model.LabelCrossEntropy([softmax_reshaped, target], 'xent')
-        loss = model.ReduceFrontSum(xent, 'loss')
+        loss = model.AveragedLoss(xent, 'loss')
         model.AddGradientOperators([loss])
 
         ITER = model.Iter("iter")
         LR = model.LearningRate(
-            ITER, "LR", base_lr=-0.1 / self.batch_size,
+            ITER, "LR", base_lr=-0.1 * self.seq_length,
             policy="step", stepsize=1, gamma=0.9999)
         ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
 
@@ -127,6 +125,8 @@
 
         # We iterate over text in a loop many times. Each time we peak
         # seq_length segment and feed it to LSTM as a sequence
+        last_time = datetime.now()
+        progress = 0
         while True:
             workspace.FeedBlob(
                 "seq_lengths", np.array([self.seq_length]).astype(np.int32))
@@ -145,6 +145,7 @@
                     target[i * self.batch_size + e] =\
                         self._idx_at_pos((pos + 1) % N)
                     pos = (pos + 1) % N
+                    progress += 1
 
             workspace.FeedBlob('input_blob', input)
             workspace.FeedBlob('target', target)
@@ -156,10 +157,22 @@
             last_n_iter += 1
 
             if num_iter % self.iters_to_report == 0:
+                new_time = datetime.now()
+                print("Characters Per Second: {}". format(
+                    int(progress / (new_time - last_time).total_seconds())
+                ))
+                print("Iterations Per Second: {}". format(
+                    int(self.iters_to_report /
+                        (new_time - last_time).total_seconds())
+                ))
+
+                last_time = new_time
+                progress = 0
+
                 print("{} Iteration {} {}".
                       format('-' * 10, num_iter, '-' * 10))
 
-            loss = workspace.FetchBlob(self.loss) / self.batch_size
+            loss = workspace.FetchBlob(self.loss) * self.seq_length
             smooth_loss = 0.999 * smooth_loss + 0.001 * loss
             last_n_loss += loss
 
@@ -188,8 +201,8 @@
 
             input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
             input[0][0][self.char_to_idx[ch]] = 1
-            workspace.FeedBlob("input_blob", input)
 
+            workspace.FeedBlob("input_blob", input)
             workspace.RunNet(self.forward_net.Name())
 
             p = workspace.FetchBlob(self.predictions)
@@ -217,12 +230,17 @@
                         help="How often to report loss and generate text")
     parser.add_argument("--hidden_size", type=int, default=100,
                         help="Dimention of the hidden representation")
+    parser.add_argument("--gpu", action="store_true",
+                        help="If set, training is going to use GPU 0")
 
     args = parser.parse_args()
 
-    model = CharRNN(args)
-    model.CreateModel()
-    model.TrainModel()
+    device = core.DeviceOption(
+        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)
+    with core.DeviceScope(device):
+        model = CharRNN(args)
+        model.CreateModel()
+        model.TrainModel()
 
 
 if __name__ == '__main__':