| ## @package lstm_benchmark |
| # Module caffe2.python.lstm_benchmark |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| from __future__ import unicode_literals |
| |
| from caffe2.proto import caffe2_pb2 |
| from caffe2.python import cnn, workspace, core, utils, rnn_cell |
| |
| import argparse |
| import numpy as np |
| import time |
| |
| import logging |
| |
# Module-level logger used by every function in this benchmark. It is set to
# DEBUG so that all benchmark messages are emitted.
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)

# Configure the root logger with the logging module's defaults.
logging.basicConfig()
| |
| |
def generate_data(T, shape, num_labels):
    '''
    Fill a pair of blob queues with random input and label batches.

    Two queues named "inputqueue" and "labelqueue" are created, each with
    capacity T, and T batches are enqueued into each of them.

    Args:
        T: number of batches to generate (also the capacity of each queue).
        shape: sequence (list or tuple) describing the largest input blob.
            shape[0] is the upper bound used when drawing the sequence
            length, shape[1] is used as the batch size, and the remaining
            entries are copied into every generated blob's shape.
        num_labels: multiplied by the batch size to obtain the exclusive
            upper bound of the generated label values.

    Returns:
        Tuple (queue, label_queue) of the two blob queues that were filled.

    Raises:
        ValueError: if shape[0] is smaller than 1.

    Side effects:
        Reseeds numpy's global random generator with 2603 and overwrites the
        workspace blobs "scratch" and "label_scr" on every iteration.
    '''
    log.info("Generating T={} sequence batches".format(T))

    max_seq_length = shape[0]
    if max_seq_length < 1:
        raise ValueError(
            "shape[0] must be at least 1, got {}".format(max_seq_length)
        )
    # Accept tuples as well as lists; list + tuple would raise TypeError.
    trailing_dims = list(shape[1:])
    # randint's upper bound is exclusive and must be greater than the lower
    # bound. Clamping it to 2 lets a maximum length of 1 produce length-1
    # sequences instead of raising, and leaves the draws for larger maximum
    # lengths exactly as they were.
    seq_length_bound = max(max_seq_length, 2)

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )

    workspace.RunNetOnce(generate_input_init_net)

    # This net moves the staging blobs into the queues; it is run once per
    # generated batch below.
    generate_input_net = core.Net('generate_input')
    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])

    # Fixed seed so that repeated runs generate the same data.
    np.random.seed(2603)

    for t in range(T):
        if t % 50 == 0:
            print("Generating data {}/{}".format(t, T))
        # Randomize the seqlength
        random_shape = (
            [np.random.randint(1, seq_length_bound)] + trailing_dims
        )
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        # One integer label per time step, drawn from [0, L).
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())

    log.info("Finished data generation")

    return queue, label_queue
| |
| |
def create_model(args, queue, label_queue, input_shape):
    '''
    Build the LSTM benchmark model: dequeue -> LSTM -> softmax loss.

    Args:
        args: parsed options. Reads implementation ("own" or "cudnn"),
            input_dim, hidden_dim and batch_size, plus memory_optimization
            when the implementation is "own".
        queue: blob queue from which the "input_data" blob is dequeued.
        label_queue: blob queue from which the "label" blob is dequeued.
        input_shape: shape used to pre-fill the input blob in the param
            init net (only used by the "cudnn" implementation).

    Returns:
        Tuple (model, output) with the CNNModelHelper that was built and the
        LSTM output blob.

    Raises:
        AssertionError: if args.implementation is neither "own" nor "cudnn".

    Side effects:
        Feeds zero-filled float32 blobs of shape
        [1, args.batch_size, args.hidden_dim] into the workspace for the
        initial hidden and cell states.
    '''
    model = cnn.CNNModelHelper(name="LSTM_bench")
    # 'target' is registered as an external input of the net but is not
    # referenced anywhere else in this function.
    seq_lengths, hidden_init, cell_init, _unused_target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        # NOTE(review): assumes the third value returned by cudnn_LSTM is
        # the final cell state - confirm against rnn_cell.cudnn_LSTM.
        output, last_hidden, last_state = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
        )
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Unknown implementation")

    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    model.AddGradientOperators([loss])

    # Carry the recurrent states over to the next iteration: the final
    # hidden state seeds hidden_init and the final cell state seeds
    # cell_init.
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Both states start out as zeros.
    initial_state_shape = [1, args.batch_size, args.hidden_dim]
    workspace.FeedBlob(
        hidden_init, np.zeros(initial_state_shape, dtype=np.float32)
    )
    workspace.FeedBlob(
        cell_init, np.zeros(initial_state_shape, dtype=np.float32)
    )
    return model, output
| |
| |
def Caffe2LSTM(args):
    '''
    Generate data, build the model and run the timed benchmark loop.

    Reads data_size, batch_size, seq_length, input_dim, hidden_dim and
    iters_to_report from args. The net is run in chunks of at most
    args.iters_to_report iterations; after every chunk, and once more at the
    end, the throughput is logged in thousands of entries per second.

    NOTE(review): throughput is computed from the nominal
    seq_length * batch_size entries per iteration, while generate_data draws
    a random sequence length for every batch - confirm that the reported
    figure is meant to be nominal.
    '''
    batches_total = args.data_size // args.batch_size
    blob_shape = [args.seq_length, args.batch_size, args.input_dim]

    queue, label_queue = generate_data(
        batches_total // args.seq_length,
        blob_shape,
        args.hidden_dim,
    )

    # Every entry of the batch is declared with the full sequence length.
    lengths = np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    workspace.FeedBlob("seq_lengths", lengths)

    model, _ = create_model(args, queue, label_queue, blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    start_time = time.time()
    chunk_start = start_time
    num_iters = batches_total // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size

    # Run the Benchmark
    log.info("------ Starting benchmark ------")
    for first_iter in range(0, num_iters, args.iters_to_report):
        # The last chunk may be shorter than iters_to_report.
        remaining = num_iters - first_iter
        chunk_iters = min(args.iters_to_report, remaining)
        workspace.RunNet(model.net.Proto().name, chunk_iters)

        chunk_end = time.time()
        chunk_rate = entries_per_iter * chunk_iters / (chunk_end - chunk_start)
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            first_iter,
            num_iters,
            chunk_rate // 1000,
        ))
        chunk_start = chunk_end

    total_rate = entries_per_iter * num_iters / (time.time() - start_time)
    log.info("Done. Total EPS: {}k".format(total_rate // 1000))
| |
| |
# NOTE(review): utils.debug is defined outside this file, so its effect on
# this function is not visible here - confirm against caffe2.python.utils.
@utils.debug
def Benchmark(args):
    '''
    Run the LSTM benchmark by passing args unchanged to Caffe2LSTM.
    '''
    Caffe2LSTM(args)
| |
| |
def GetArgumentParser():
    '''
    Build the command-line parser for the LSTM benchmark.

    Returns:
        An argparse.ArgumentParser that knows the options --hidden_dim,
        --input_dim, --batch_size, --seq_length, --data_size,
        --iters_to_report, --gpu, --implementation and
        --memory_optimization.
    '''
    # (flag, add_argument keyword arguments), in the order they are
    # registered and therefore listed in the --help output.
    option_specs = [
        ("--hidden_dim",
         dict(type=int, default=40, help="Hidden dimension")),
        ("--input_dim",
         dict(type=int, default=40, help="Input dimension")),
        ("--batch_size",
         dict(type=int, default=256, help="The batch size.")),
        ("--seq_length",
         dict(type=int, default=20, help="Max sequence length")),
        ("--data_size",
         dict(type=int, default=10000000,
              help="Number of data points to generate")),
        ("--iters_to_report",
         dict(type=int, default=100,
              help="Number of iteration to report progress")),
        ("--gpu",
         dict(action="store_true", help="Run all on GPU")),
        ("--implementation",
         dict(type=str, default="own", help="'cudnn' or 'own'")),
        ("--memory_optimization",
         dict(action="store_true",
              help="Whether to use memory optimized LSTM or not")),
    ]

    parser = argparse.ArgumentParser(description="LSTM benchmark.")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
| |
| |
if __name__ == '__main__':
    # Script entry point: parse the options, initialise Caffe2 and run the
    # benchmark inside the selected device scope.
    args = GetArgumentParser().parse_args()

    init_flags = [
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
    ]
    workspace.GlobalInit(init_flags)

    # --gpu selects CUDA device 0; otherwise the CPU is used.
    if args.gpu:
        device_type = caffe2_pb2.CUDA
    else:
        device_type = caffe2_pb2.CPU

    with core.DeviceScope(core.DeviceOption(device_type, 0)):
        Benchmark(args)