GPU support for RecurrentOp + Char RNN example
Summary: With a batch size of 32 and otherwise default parameters, I get 70 iterations per second on GPU vs. 40 on CPU. Batching still doesn't produce good loss yet; I am going to work on that in a separate diff.
Reviewed By: urikz
Differential Revision: D4516566
fbshipit-source-id: d0611534747beb2cd935a8607a283369378e4a6c
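
Usage note (a minimal sketch, not part of the diff): the new --gpu flag in char_rnn.py wraps model construction and training in a CUDA DeviceScope, so the CUDA-registered RecurrentNetwork / RecurrentNetworkGradient ops are picked up automatically. Assuming a CUDA-enabled Caffe2 build and GPU 0, the pattern looks like:

    from caffe2.python import core
    from caffe2.proto import caffe2_pb2

    use_gpu = True  # corresponds to passing --gpu to char_rnn.py
    device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, 0)
    with core.DeviceScope(device):
        # Operators created in this scope, including RecurrentNetwork and its
        # gradient, are placed on the selected device.
        model = CharRNN(args)  # CharRNN/args as defined in the example script
        model.CreateModel()
        model.TrainModel()
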
diff --git a/caffe2/operators/recurrent_network_op.cc b/caffe2/operators/recurrent_network_op.cc
index d142a4a..9ca3598 100644
--- a/caffe2/operators/recurrent_network_op.cc
+++ b/caffe2/operators/recurrent_network_op.cc
@@ -75,4 +75,34 @@
REGISTER_GRADIENT(RecurrentNetwork, GetRecurrentNetworkGradient);
}
+
+namespace detail {
+void extractLinks(
+ OperatorBase* op,
+ const std::string& internalArg,
+ const std::string& externalArg,
+ const std::string offsetArg,
+ std::vector<detail::Link>* links) {
+ const auto& internal = op->GetRepeatedArgument<std::string>(internalArg);
+ const auto& external = op->GetRepeatedArgument<std::string>(externalArg);
+ const auto& offset = op->GetRepeatedArgument<int32_t>(offsetArg);
+ CAFFE_ENFORCE(
+ internal.size() == offset.size(),
+ "internal/offset mismatch: ",
+ internalArg,
+ externalArg);
+ CAFFE_ENFORCE(
+ external.size() == offset.size(),
+ "external/offset mismatch",
+ externalArg,
+ offsetArg);
+ for (auto i = 0; i < internal.size(); ++i) {
+ detail::Link l;
+ l.internal = internal[i];
+ l.external = external[i];
+ l.offset = offset[i];
+ links->push_back(l);
+ }
+}
+} // namespace detail
}
diff --git a/caffe2/operators/recurrent_network_op.h b/caffe2/operators/recurrent_network_op.h
index 07d8f81..bf29d39 100644
--- a/caffe2/operators/recurrent_network_op.h
+++ b/caffe2/operators/recurrent_network_op.h
@@ -139,28 +139,7 @@
const std::string& internalArg,
const std::string& externalArg,
const std::string offsetArg,
- std::vector<detail::Link>* links) {
- const auto& internal = op->GetRepeatedArgument<std::string>(internalArg);
- const auto& external = op->GetRepeatedArgument<std::string>(externalArg);
- const auto& offset = op->GetRepeatedArgument<int32_t>(offsetArg);
- CAFFE_ENFORCE(
- internal.size() == offset.size(),
- "internal/offset mismatch: ",
- internalArg,
- externalArg);
- CAFFE_ENFORCE(
- external.size() == offset.size(),
- "external/offset mismatch",
- externalArg,
- offsetArg);
- for (auto i = 0; i < internal.size(); ++i) {
- detail::Link l;
- l.internal = internal[i];
- l.external = external[i];
- l.offset = offset[i];
- links->push_back(l);
- }
-}
+ std::vector<detail::Link>* links);
} // namespace detail
template <typename T, class Context>
diff --git a/caffe2/operators/recurrent_network_op_gpu.cc b/caffe2/operators/recurrent_network_op_gpu.cc
new file mode 100644
index 0000000..16ae8c3
--- /dev/null
+++ b/caffe2/operators/recurrent_network_op_gpu.cc
@@ -0,0 +1,13 @@
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/operators/recurrent_network_op.h"
+
+namespace caffe2 {
+namespace {
+REGISTER_CUDA_OPERATOR(
+ RecurrentNetwork,
+ RecurrentNetworkOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+ RecurrentNetworkGradient,
+ RecurrentNetworkGradientOp<float, CUDAContext>);
+}
+}
diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py
index 1f97a7c..3c091b1 100644
--- a/caffe2/python/examples/char_rnn.py
+++ b/caffe2/python/examples/char_rnn.py
@@ -5,10 +5,13 @@
from caffe2.python import core, workspace, cnn, utils
from caffe2.python.recurrent import LSTM
+from caffe2.proto import caffe2_pb2
+
import argparse
import logging
import numpy as np
+from datetime import datetime
logging.basicConfig()
@@ -26,11 +29,6 @@
class CharRNN(object):
- seq_length = 25
- batch_size = 1
- iters_to_report = 500
- hidden_size = 100
-
def __init__(self, args):
self.seq_length = args.seq_length
self.batch_size = args.batch_size
@@ -79,12 +77,12 @@
self.forward_net = core.Net(model.net.Proto())
xent = model.LabelCrossEntropy([softmax_reshaped, target], 'xent')
- loss = model.ReduceFrontSum(xent, 'loss')
+ loss = model.AveragedLoss(xent, 'loss')
model.AddGradientOperators([loss])
ITER = model.Iter("iter")
LR = model.LearningRate(
- ITER, "LR", base_lr=-0.1 / self.batch_size,
+ ITER, "LR", base_lr=-0.1 * self.seq_length,
policy="step", stepsize=1, gamma=0.9999)
ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
@@ -127,6 +125,8 @@
# We iterate over text in a loop many times. Each time we peak
# seq_length segment and feed it to LSTM as a sequence
+ last_time = datetime.now()
+ progress = 0
while True:
workspace.FeedBlob(
"seq_lengths", np.array([self.seq_length]).astype(np.int32))
@@ -145,6 +145,7 @@
target[i * self.batch_size + e] =\
self._idx_at_pos((pos + 1) % N)
pos = (pos + 1) % N
+ progress += 1
workspace.FeedBlob('input_blob', input)
workspace.FeedBlob('target', target)
@@ -156,10 +157,22 @@
last_n_iter += 1
if num_iter % self.iters_to_report == 0:
+ new_time = datetime.now()
+ print("Characters Per Second: {}". format(
+ int(progress / (new_time - last_time).total_seconds())
+ ))
+ print("Iterations Per Second: {}". format(
+ int(self.iters_to_report /
+ (new_time - last_time).total_seconds())
+ ))
+
+ last_time = new_time
+ progress = 0
+
print("{} Iteration {} {}".
format('-' * 10, num_iter, '-' * 10))
- loss = workspace.FetchBlob(self.loss) / self.batch_size
+ loss = workspace.FetchBlob(self.loss) * self.seq_length
smooth_loss = 0.999 * smooth_loss + 0.001 * loss
last_n_loss += loss
@@ -188,8 +201,8 @@
input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
input[0][0][self.char_to_idx[ch]] = 1
- workspace.FeedBlob("input_blob", input)
+ workspace.FeedBlob("input_blob", input)
workspace.RunNet(self.forward_net.Name())
p = workspace.FetchBlob(self.predictions)
@@ -217,12 +230,17 @@
help="How often to report loss and generate text")
parser.add_argument("--hidden_size", type=int, default=100,
help="Dimention of the hidden representation")
+ parser.add_argument("--gpu", action="store_true",
+ help="If set, training is going to use GPU 0")
args = parser.parse_args()
- model = CharRNN(args)
- model.CreateModel()
- model.TrainModel()
+ device = core.DeviceOption(
+ caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)
+ with core.DeviceScope(device):
+ model = CharRNN(args)
+ model.CreateModel()
+ model.TrainModel()
if __name__ == '__main__':