caffe2/python/mkl/convnet_benchmarks.py - platform/external/pytorch - Git at Google

 # Copyright (c) 2016-present, Facebook, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ##############################################################################

 ## @package convnet_benchmarks
 # Module caffe2.python.convnet_benchmarks
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
 """
 Benchmark for common convnets.

 Speed on Titan X, with 10 warmup steps and 10 main steps and with different
 versions of cudnn, are as follows (time reported below is per-batch time,
 forward / forward+backward):

                     CuDNN V3        CuDNN v4
 AlexNet         32.5 / 108.0    27.4 /  90.1
 OverFeat       113.0 / 342.3    91.7 / 276.5
 Inception      134.5 / 485.8   125.7 / 450.6
 VGG (batch 64) 200.8 / 650.0   164.1 / 551.7

 Speed on Inception with varied batch sizes and CuDNN v4 is as follows:

 Batch Size   Speed per batch     Speed per image
  16             22.8 /  72.7         1.43 / 4.54
  32             38.0 / 127.5         1.19 / 3.98
  64             67.2 / 233.6         1.05 / 3.65
 128            125.7 / 450.6         0.98 / 3.52

 Speed on Tesla M40, with 10 warmup steps and 10 main steps and with cudnn
 v4, is as follows:

 AlexNet         68.4 / 218.1
 OverFeat       210.5 / 630.3
 Inception      300.2 / 1122.2
 VGG (batch 64) 405.8 / 1327.7

 (Note that these numbers involve a "full" backprop, i.e. the gradient
 with respect to the input image is also computed.)

 To get the numbers, simply run:

 for MODEL in AlexNet OverFeat Inception; do
   PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
     --batch_size 128 --model $MODEL --forward_only
 done
 for MODEL in AlexNet OverFeat Inception; do
   PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
     --batch_size 128 --model $MODEL
 done
 PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
   --batch_size 64 --model VGGA --forward_only
 PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
   --batch_size 64 --model VGGA

 for BS in 16 32 64 128; do
   PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
     --batch_size $BS --model Inception --forward_only
   PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
     --batch_size $BS --model Inception
 done

 Note that VGG needs to be run at batch 64 due to memory limit on the backward
 pass.
 """

 import argparse

 from caffe2.python import brew, cnn, workspace
 from caffe2.python.model_helper import ModelHelper

 from caffe2.python.models import resnet
 import numpy as np

 def MLP(order, cudnn_ws, mkl):
     """Build the multi-branch fully connected benchmark network.

     Three parallel chains of 20 FC layers, each 256 units wide, all read
     from "data"; the chain outputs are summed and passed to a final
     1000-way FC layer ("last") followed by a label cross-entropy.  An
     averaged "loss" blob is only added when `mkl` is false.

     `order` and `cudnn_ws` are accepted so the signature matches the other
     model generators; this function does not use them.

     Returns:
         A (model, input_dim) tuple, where input_dim is the FC width (256).
     """
     hidden_dim = 256
     num_layers = 20
     num_branches = 3
     blob_name = "fc_{}_{}".format

     model = ModelHelper(name="benchmark")
     for layer in range(num_layers):
         for branch in range(num_branches):
             # The first layer of every branch reads the shared input blob.
             source = "data" if layer == 0 else blob_name(layer, branch)
             brew.fc(
                 model,
                 source,
                 blob_name(layer + 1, branch),
                 dim_in=hidden_dim,
                 dim_out=hidden_dim,
                 weight_init=('XavierFill', {}),
                 bias_init=('XavierFill', {})
             )

     # Merge the last layer of every branch before the classifier.
     branch_outputs = [
         blob_name(num_layers, branch) for branch in range(num_branches)
     ]
     brew.sum(model, branch_outputs, ["sum"])
     brew.fc(
         model,
         "sum",
         "last",
         dim_in=hidden_dim,
         dim_out=1000,
         weight_init=('XavierFill', {}),
         bias_init=('XavierFill', {})
     )
     cross_entropy = model.LabelCrossEntropy(["last", "label"], "xent")
     if not mkl:
         model.AveragedLoss(cross_entropy, "loss")
     return model, hidden_dim


 def ResNet50(order, cudnn_ws, mkl):
     """Build a ResNet-50 benchmark network in test mode.

     Args:
         order: storage order forwarded to the helpers through arg_scope.
         cudnn_ws: cuDNN workspace limit; stringified into 'ws_nbytes_limit'.
         mkl: unused here; kept so the signature matches the other
             model generators.

     Returns:
         A (model, 448) tuple; 448 is the square input size that Benchmark
         uses to shape the "data" blob.
     """
     # NOTE(review): no "loss" blob is created in this function, while
     # Benchmark calls AddGradientOperators(["loss"]) for forward-backward
     # runs, so this generator looks forward-only -- confirm.
     my_arg_scope = {
         'order': order,
         'use_cudnn': True,
         'cudnn_exhaustive_search': True,
         'ws_nbytes_limit': str(cudnn_ws),
     }
     # The net used to be named "alexnet" (copy-paste slip), which mislabelled
     # benchmark output and dumped protos for this model.
     model = ModelHelper(name="resnet50", arg_scope=my_arg_scope)
     resnet.create_resnet50(
         model, "data", 3, 1000, is_test=True, final_avg_kernel=14
     )
     return model, 448

 def AlexNet(order, cudnn_ws, mkl):
     """Build the AlexNet benchmark network.

     Five conv+ReLU stages (max-pooled after stages 1, 2 and 5) feed three
     FC layers, a softmax ("pred") and a label cross-entropy ("xent").  The
     averaged "loss" blob is only added when `mkl` is false.

     Args:
         order: storage order forwarded to the helpers through arg_scope.
         cudnn_ws: cuDNN workspace limit; stringified into 'ws_nbytes_limit'.
         mkl: when true, skip the AveragedLoss operator.

     Returns:
         A (model, 224) tuple; 224 is the square input size.
     """
     model = ModelHelper(
         name="alexnet",
         arg_scope={
             'order': order,
             'use_cudnn': True,
             'cudnn_exhaustive_search': True,
             'ws_nbytes_limit': str(cudnn_ws),
         }
     )

     # (blob name, channels in, channels out, kernel, extra conv kwargs,
     #  name of the max-pool that follows the stage, or None)
     conv_stages = [
         ("conv1", 3, 64, 11, {'stride': 4, 'pad': 2}, "pool1"),
         ("conv2", 64, 192, 5, {'pad': 2}, "pool2"),
         ("conv3", 192, 384, 3, {'pad': 1}, None),
         ("conv4", 384, 256, 3, {'pad': 1}, None),
         ("conv5", 256, 256, 3, {'pad': 1}, "pool5"),
     ]
     blob = "data"
     for name, dim_in, dim_out, ksize, conv_kwargs, pool_name in conv_stages:
         blob = brew.conv(
             model, blob, name, dim_in, dim_out, ksize,
             ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
         )
         # ReLU writes back into the conv output blob.
         blob = brew.relu(model, blob, name)
         if pool_name is not None:
             blob = brew.max_pool(model, blob, pool_name, kernel=3, stride=2)

     # (blob name, inputs, outputs, followed by ReLU?)
     fc_stages = [
         ("fc6", 256 * 6 * 6, 4096, True),
         ("fc7", 4096, 4096, True),
         ("fc8", 4096, 1000, False),
     ]
     for name, dim_in, dim_out, with_relu in fc_stages:
         blob = brew.fc(
             model, blob, name, dim_in, dim_out,
             ('XavierFill', {}), ('ConstantFill', {})
         )
         if with_relu:
             blob = brew.relu(model, blob, name)

     pred = brew.softmax(model, blob, "pred")
     xent = model.LabelCrossEntropy([pred, "label"], "xent")
     if not mkl:
         model.AveragedLoss(xent, "loss")
     return model, 224


 def OverFeat(order, cudnn_ws, mkl):
     """Build the OverFeat benchmark network.

     Five conv+ReLU stages (2x2 max-pooled after stages 1, 2 and 5) feed
     three FC layers, a softmax ("pred") and a label cross-entropy ("xent").
     The averaged "loss" blob is only added when `mkl` is false.

     Args:
         order: storage order forwarded to the helpers through arg_scope.
         cudnn_ws: cuDNN workspace limit; stringified into 'ws_nbytes_limit'.
         mkl: when true, skip the AveragedLoss operator.

     Returns:
         A (model, 231) tuple; 231 is the square input size.
     """
     model = ModelHelper(
         name='overfeat',
         arg_scope={
             'order': order,
             'use_cudnn': True,
             'cudnn_exhaustive_search': True,
             'ws_nbytes_limit': str(cudnn_ws),
         }
     )

     # (blob name, channels in, channels out, kernel, extra conv kwargs,
     #  name of the max-pool that follows the stage, or None)
     conv_stages = [
         ("conv1", 3, 96, 11, {'stride': 4}, "pool1"),
         ("conv2", 96, 256, 5, {}, "pool2"),
         ("conv3", 256, 512, 3, {'pad': 1}, None),
         ("conv4", 512, 1024, 3, {'pad': 1}, None),
         ("conv5", 1024, 1024, 3, {'pad': 1}, "pool5"),
     ]
     blob = "data"
     for name, dim_in, dim_out, ksize, conv_kwargs, pool_name in conv_stages:
         blob = brew.conv(
             model, blob, name, dim_in, dim_out, ksize,
             ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
         )
         # ReLU writes back into the conv output blob.
         blob = brew.relu(model, blob, name)
         if pool_name is not None:
             blob = brew.max_pool(model, blob, pool_name, kernel=2, stride=2)

     # (blob name, inputs, outputs, followed by ReLU?)
     fc_stages = [
         ("fc6", 1024 * 6 * 6, 3072, True),
         ("fc7", 3072, 4096, True),
         ("fc8", 4096, 1000, False),
     ]
     for name, dim_in, dim_out, with_relu in fc_stages:
         blob = brew.fc(
             model, blob, name, dim_in, dim_out,
             ('XavierFill', {}), ('ConstantFill', {})
         )
         if with_relu:
             blob = brew.relu(model, blob, name)

     pred = brew.softmax(model, blob, "pred")
     xent = model.LabelCrossEntropy([pred, "label"], "xent")
     if not mkl:
         model.AveragedLoss(xent, "loss")
     return model, 231


 def VGGA(order, cudnn_ws, mkl):
     """Build the VGG-A benchmark network.

     Eight 3x3 conv+ReLU stages (all with pad=1; 2x2 max-pooled after
     stages 1, 2, 4, 6 and 8) feed three FC layers, a softmax ("pred") and a
     label cross-entropy ("xent").  The averaged "loss" blob is only added
     when `mkl` is false.

     Args:
         order: storage order forwarded to the helpers through arg_scope.
         cudnn_ws: cuDNN workspace limit; stringified into 'ws_nbytes_limit'.
         mkl: when true, skip the AveragedLoss operator.

     Returns:
         A (model, 231) tuple; 231 is the square input size.
     """
     model = ModelHelper(
         name='vgg-a',
         arg_scope={
             'order': order,
             'use_cudnn': True,
             'cudnn_exhaustive_search': True,
             'ws_nbytes_limit': str(cudnn_ws),
         }
     )

     # (blob name, channels in, channels out,
     #  name of the max-pool that follows the stage, or None)
     conv_stages = [
         ("conv1", 3, 64, "pool1"),
         ("conv2", 64, 128, "pool2"),
         ("conv3", 128, 256, None),
         ("conv4", 256, 256, "pool4"),
         ("conv5", 256, 512, None),
         ("conv6", 512, 512, "pool6"),
         ("conv7", 512, 512, None),
         ("conv8", 512, 512, "pool8"),
     ]
     blob = "data"
     for name, dim_in, dim_out, pool_name in conv_stages:
         blob = brew.conv(
             model, blob, name, dim_in, dim_out, 3,
             ('XavierFill', {}), ('ConstantFill', {}), pad=1
         )
         # ReLU writes back into the conv output blob.
         blob = brew.relu(model, blob, name)
         if pool_name is not None:
             blob = brew.max_pool(model, blob, pool_name, kernel=2, stride=2)

     # (blob name, inputs, outputs, followed by ReLU?)
     fc_stages = [
         ("fcix", 512 * 7 * 7, 4096, True),
         ("fcx", 4096, 4096, True),
         ("fcxi", 4096, 1000, False),
     ]
     for name, dim_in, dim_out, with_relu in fc_stages:
         blob = brew.fc(
             model, blob, name, dim_in, dim_out,
             ('XavierFill', {}), ('ConstantFill', {})
         )
         if with_relu:
             blob = brew.relu(model, blob, name)

     pred = brew.softmax(model, blob, "pred")
     xent = model.LabelCrossEntropy([pred, "label"], "xent")
     if not mkl:
         model.AveragedLoss(xent, "loss")
     return model, 231


 def _InceptionModule(
     model, input_blob, input_depth, output_name, conv1_depth, conv3_depths,
     conv5_depths, pool_depth
 ):
     """Add one four-path inception module to `model`.

     Args:
         model: the ModelHelper being built.
         input_blob: blob consumed by all four paths.
         input_depth: channel count of `input_blob`.
         output_name: name of the concatenated output; also the prefix of
             every intermediate blob ("<output_name>:conv1", ...).
         conv1_depth: output channels of the 1x1 path.
         conv3_depths: [reduce channels, output channels] of the 3x3 path.
         conv5_depths: [reduce channels, output channels] of the 5x5 path.
         pool_depth: output channels of the pool-projection path.

     Returns:
         Whatever brew.concat returns for the four path outputs.
     """
     def conv_relu(blob_in, suffix, dim_in, dim_out, ksize, **conv_kwargs):
         # Convolution followed by a ReLU written back into the conv blob.
         conv = brew.conv(
             model, blob_in, output_name + suffix, dim_in, dim_out, ksize,
             ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
         )
         return brew.relu(model, conv, conv)

     # path 1: 1x1 conv
     path_1x1 = conv_relu(input_blob, ":conv1", input_depth, conv1_depth, 1)

     # path 2: 1x1 conv + 3x3 conv
     reduce_3x3 = conv_relu(
         input_blob, ":conv3_reduce", input_depth, conv3_depths[0], 1
     )
     path_3x3 = conv_relu(
         reduce_3x3, ":conv3", conv3_depths[0], conv3_depths[1], 3, pad=1
     )

     # path 3: 1x1 conv + 5x5 conv
     reduce_5x5 = conv_relu(
         input_blob, ":conv5_reduce", input_depth, conv5_depths[0], 1
     )
     path_5x5 = conv_relu(
         reduce_5x5, ":conv5", conv5_depths[0], conv5_depths[1], 5, pad=2
     )

     # path 4: pool + 1x1 conv
     pooled = brew.max_pool(
         model, input_blob, output_name + ":pool", kernel=3, stride=1, pad=1
     )
     path_pool = conv_relu(pooled, ":pool_proj", input_depth, pool_depth, 1)

     return brew.concat(
         model, [path_1x1, path_3x3, path_5x5, path_pool], output_name
     )


 def Inception(order, cudnn_ws, mkl):
     """Build the Inception benchmark network.

     A conv/pool stem is followed by nine inception modules (inc3..inc11,
     max-pooled after inc4 and inc9), a 7x7 average pool, a 1000-way FC
     layer, a softmax ("pred") and a label cross-entropy ("xent").  The
     averaged "loss" blob is only added when `mkl` is false.

     Args:
         order: storage order forwarded to the helpers through arg_scope.
         cudnn_ws: cuDNN workspace limit; stringified into 'ws_nbytes_limit'.
         mkl: when true, skip the AveragedLoss operator.

     Returns:
         A (model, 224) tuple; 224 is the square input size.
     """
     model = ModelHelper(
         name="inception",
         arg_scope={
             'order': order,
             'use_cudnn': True,
             'cudnn_exhaustive_search': True,
             'ws_nbytes_limit': str(cudnn_ws),
         }
     )

     # Stem: 7x7 conv, pool, 1x1 conv, 3x3 conv, pool.
     stem = brew.conv(
         model, "data", "conv1", 3, 64, 7,
         ('XavierFill', {}), ('ConstantFill', {}), stride=2, pad=3
     )
     stem = brew.relu(model, stem, "conv1")
     stem = brew.max_pool(model, stem, "pool1", kernel=3, stride=2, pad=1)
     stem = brew.conv(
         model, stem, "conv2a", 64, 64, 1,
         ('XavierFill', {}), ('ConstantFill', {})
     )
     stem = brew.relu(model, stem, stem)
     stem = brew.conv(
         model, stem, "conv2", 64, 192, 3,
         ('XavierFill', {}), ('ConstantFill', {}), pad=1
     )
     stem = brew.relu(model, stem, "conv2")
     blob = brew.max_pool(model, stem, "pool2", kernel=3, stride=2, pad=1)

     # Inception modules:
     # (name, input depth, 1x1 depth, 3x3 depths, 5x5 depths, pool depth,
     #  name of the max-pool that follows the module, or None)
     modules = [
         ("inc3", 192, 64, [96, 128], [16, 32], 32, None),
         ("inc4", 256, 128, [128, 192], [32, 96], 64, "pool5"),
         ("inc5", 480, 192, [96, 208], [16, 48], 64, None),
         ("inc6", 512, 160, [112, 224], [24, 64], 64, None),
         ("inc7", 512, 128, [128, 256], [24, 64], 64, None),
         ("inc8", 512, 112, [144, 288], [32, 64], 64, None),
         ("inc9", 528, 256, [160, 320], [32, 128], 128, "pool9"),
         ("inc10", 832, 256, [160, 320], [32, 128], 128, None),
         ("inc11", 832, 384, [192, 384], [48, 128], 128, None),
     ]
     for name, depth_in, d1, d3, d5, dpool, pool_name in modules:
         blob = _InceptionModule(
             model, blob, depth_in, name, d1, d3, d5, dpool
         )
         if pool_name is not None:
             blob = brew.max_pool(
                 model, blob, pool_name, kernel=3, stride=2, pad=1
             )

     pooled = brew.average_pool(model, blob, "pool11", kernel=7, stride=1)
     fc = brew.fc(
         model, pooled, "fc", 1024, 1000,
         ('XavierFill', {}), ('ConstantFill', {})
     )
     # It seems that Soumith's benchmark does not have softmax on top
     # for Inception. We will add it anyway so we can have a proper
     # backward pass.
     pred = brew.softmax(model, fc, "pred")
     xent = model.LabelCrossEntropy([pred, "label"], "xent")
     if not mkl:
         model.AveragedLoss(xent, "loss")
     return model, 224


 def AddParameterUpdate(model):
     """Append a plain SGD step for every parameter of `model`.

     Not tuned to actually train the models; it only makes the benchmark
     include a parameter update.  Each parameter is overwritten in place
     with WeightedSum([param, ONE, grad, LR]).
     """
     iteration = brew.iter(model, "iter")
     learning_rate = model.LearningRate(
         iteration,
         "LR",
         base_lr=-1e-8,
         policy="step",
         stepsize=10000,
         gamma=0.999
     )
     # Constant 1.0 used as the weight of the parameter itself.
     one = model.param_init_net.ConstantFill(
         [], "ONE", shape=[1], value=1.0
     )
     for param in model.params:
         grad = model.param_to_grad[param]
         model.WeightedSum([param, one, grad, learning_rate], param)


 def Benchmark(model_gen, arg):
     """Build, initialise and time one benchmark network.

     Args:
         model_gen: callable taking (order, cudnn_ws, mkl) and returning a
             (model, input_size) tuple, e.g. AlexNet or MLP.
         arg: parsed command-line namespace (see GetArgumentParser).

     The "data" and "label" blobs are produced up front so the net can run
     without external feeding.  Timing is done by workspace.BenchmarkNet;
     nothing is returned.
     """
     if arg.mkl and not arg.forward_only:
         # The model generators skip the "loss" blob when mkl is set, so the
         # gradient operators below could not be added.  Actually stop here,
         # as the message promises, instead of failing later.
         print(
             '==WARNING==\n'
             'forward-backward not supported yet in MKL, so exiting'
         )
         return

     model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.mkl)
     model.Proto().type = arg.net_type
     model.Proto().num_workers = arg.num_workers

     # In order to be able to run everything without feeding more stuff, let's
     # add the data and label blobs to the parameter initialization net as well.
     if arg.model == "MLP":
         # The MLP takes flat feature vectors rather than images.
         input_shape = [arg.batch_size, input_size]
     elif arg.order == "NCHW":
         input_shape = [arg.batch_size, 3, input_size, input_size]
     else:
         input_shape = [arg.batch_size, input_size, input_size, 3]

     model.param_init_net.GaussianFill(
         [],
         "data",
         shape=input_shape,
         mean=0.0,
         std=1.0
     )
     # MKL doesn't support int, so the labels are generated with numpy and
     # fed to the workspace directly.
     if arg.mkl:
         label = np.random.randint(
             low=0, high=1000, size=(arg.batch_size,)
         ).astype(np.int32)
         workspace.FeedBlob("label", label)
     else:
         model.param_init_net.UniformIntFill(
             [],
             "label",
             shape=[arg.batch_size, ],
             min=0,
             max=999
         )

     if arg.forward_only:
         print('{}: running forward only.'.format(arg.model))
     else:
         print('{}: running forward-backward.'.format(arg.model))
         model.AddGradientOperators(["loss"])
         AddParameterUpdate(model)
         if arg.order == 'NHWC':
             print(
                 '==WARNING==\n'
                 'NHWC order with CuDNN may not be supported yet, so I might\n'
                 'exit suddenly.'
             )

     # Pick the device for both nets unless plain CPU was requested.
     if not arg.cpu:
         if arg.mkl:
             model.param_init_net.RunAllOnMKL()
             model.net.RunAllOnMKL()
         else:
             model.param_init_net.RunAllOnGPU()
             model.net.RunAllOnGPU()

     if arg.engine:
         # Blindly force the requested engine on every operator.
         for op in model.net.Proto().op:
             op.engine = arg.engine

     if arg.dump_model:
         # Writes out the pbtxt for benchmarks on e.g. Android
         with open(
             "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
         ) as fid:
             fid.write(str(model.param_init_net.Proto()))
         with open("{0}.pbtxt".format(arg.model), "w") as fid:
             fid.write(str(model.net.Proto()))

     workspace.RunNetOnce(model.param_init_net)
     workspace.CreateNet(model.net)
     workspace.BenchmarkNet(
         model.net.Proto().name, arg.warmup_iterations, arg.iterations,
         arg.layer_wise_benchmark)


 def GetArgumentParser():
     """Create the command-line parser for the benchmark script.

     Returns:
         An argparse.ArgumentParser with every benchmark option registered.
     """
     parser = argparse.ArgumentParser(description="Caffe2 benchmark.")
     add = parser.add_argument

     # Workload selection.
     add("--batch_size", type=int, default=128, help="The batch size.")
     add("--model", type=str, help="The model to benchmark.")
     add("--order", type=str, default="NCHW", help="The order to evaluate.")
     add("--cudnn_ws", type=int, help="The cudnn workspace size.")

     # Timing controls.
     add(
         "--iterations", type=int, default=10,
         help="Number of iterations to run the network."
     )
     add(
         "--warmup_iterations", type=int, default=10,
         help="Number of warm-up iterations before benchmarking."
     )

     # Boolean switches (off unless given on the command line).
     add(
         "--forward_only", action='store_true',
         help="If set, only run the forward pass."
     )
     add(
         "--layer_wise_benchmark", action='store_true',
         help="If True, run the layer-wise benchmark as well."
     )
     add(
         "--cpu", action='store_true',
         help="If True, run testing on CPU instead of GPU."
     )
     add(
         "--mkl", action='store_true',
         help="If True, run testing on CPU-MKL instead of GPU."
     )
     add(
         "--engine", type=str, default="",
         help="If set, blindly prefer the given engine(s) for every op."
     )
     add(
         "--dump_model", action='store_true',
         help="If True, dump the model prototxts to disk."
     )

     # Execution / tracing options.
     add("--net_type", type=str, default="simple")
     add("--num_workers", type=int, default=2)
     add("--use-nvtx", default=False, action='store_true')
     add("--htrace_span_log_path", type=str)
     return parser


 if __name__ == '__main__':
     # Accepted --model values and the generator that builds each network.
     model_map = {
         'AlexNet': AlexNet,
         'OverFeat': OverFeat,
         'VGGA': VGGA,
         'Inception': Inception,
         'ResNet50': ResNet50,
         'MLP': MLP,
     }

     parser = GetArgumentParser()
     # Unrecognised options are forwarded to caffe2's GlobalInit below.
     args, extra_args = parser.parse_known_args()
     if not args.batch_size or not args.model or not args.order:
         parser.print_help()
     elif args.model not in model_map:
         # Report a bad model name as a usage error instead of a KeyError.
         parser.error(
             "unknown --model {}; choose one of: {}".format(
                 args.model, ", ".join(sorted(model_map))))
     else:
         init_flags = ['caffe2', '--caffe2_log_level=0'] + extra_args
         if args.use_nvtx:
             init_flags.append('--caffe2_use_nvtx')
         if args.htrace_span_log_path:
             init_flags.append(
                 '--caffe2_htrace_span_log_path=' + args.htrace_span_log_path)
         workspace.GlobalInit(init_flags)

         Benchmark(model_map[args.model], args)
	# Copyright (c) 2016-present, Facebook, Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	##############################################################################

	## @package convnet_benchmarks
	# Module caffe2.python.convnet_benchmarks
	from __future__ import absolute_import
	from __future__ import division
	from __future__ import print_function
	from __future__ import unicode_literals
	"""
	Benchmark for common convnets.

	Speed on Titan X, with 10 warmup steps and 10 main steps and with different
	versions of cudnn, are as follows (time reported below is per-batch time,
	forward / forward+backward):

	                    CuDNN V3        CuDNN v4
	AlexNet         32.5 / 108.0    27.4 /  90.1
	OverFeat       113.0 / 342.3    91.7 / 276.5
	Inception      134.5 / 485.8   125.7 / 450.6
	VGG (batch 64) 200.8 / 650.0   164.1 / 551.7

	Speed on Inception with varied batch sizes and CuDNN v4 is as follows:

	Batch Size   Speed per batch     Speed per image
	 16             22.8 /  72.7         1.43 / 4.54
	 32             38.0 / 127.5         1.19 / 3.98
	 64             67.2 / 233.6         1.05 / 3.65
	128            125.7 / 450.6         0.98 / 3.52

	Speed on Tesla M40, with 10 warmup steps and 10 main steps and with cudnn
	v4, is as follows:

	AlexNet 68.4 / 218.1
	OverFeat 210.5 / 630.3
	Inception 300.2 / 1122.2
	VGG (batch 64) 405.8 / 1327.7

	(Note that these numbers involve a "full" backprop, i.e. the gradient
	with respect to the input image is also computed.)

	To get the numbers, simply run:

	for MODEL in AlexNet OverFeat Inception; do
	  PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	    --batch_size 128 --model $MODEL --forward_only
	done
	for MODEL in AlexNet OverFeat Inception; do
	  PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	    --batch_size 128 --model $MODEL
	done
	PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	  --batch_size 64 --model VGGA --forward_only
	PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	  --batch_size 64 --model VGGA

	for BS in 16 32 64 128; do
	  PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	    --batch_size $BS --model Inception --forward_only
	  PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \
	    --batch_size $BS --model Inception
	done

	Note that VGG needs to be run at batch 64 due to memory limit on the backward
	pass.
	"""

	import argparse

	from caffe2.python import brew, cnn, workspace
	from caffe2.python.model_helper import ModelHelper

	from caffe2.python.models import resnet
	import numpy as np

	def MLP(order, cudnn_ws, mkl):
	    """Build the multi-branch fully connected benchmark network.

	    Three parallel chains of 20 FC layers, each 256 units wide, all read
	    from "data"; the chain outputs are summed and passed to a final
	    1000-way FC layer ("last") followed by a label cross-entropy.  An
	    averaged "loss" blob is only added when `mkl` is false.

	    `order` and `cudnn_ws` are accepted so the signature matches the
	    other model generators; this function does not use them.

	    Returns:
	        A (model, input_dim) tuple, where input_dim is the FC width (256).
	    """
	    model = ModelHelper(name="benchmark")
	    d = 256
	    depth = 20
	    width = 3
	    for i in range(depth):
	        for j in range(width):
	            # The first layer of every branch reads the shared input blob.
	            current = "fc_{}_{}".format(i, j) if i > 0 else "data"
	            next_ = "fc_{}_{}".format(i + 1, j)
	            brew.fc(
	                model,
	                current, next_,
	                dim_in=d, dim_out=d,
	                weight_init=('XavierFill', {}),
	                bias_init=('XavierFill', {}))

	    # Merge the last layer of every branch before the classifier.
	    brew.sum(
	        model, ["fc_{}_{}".format(depth, j) for j in range(width)], ["sum"])
	    brew.fc(model, "sum", "last",
	            dim_in=d, dim_out=1000,
	            weight_init=('XavierFill', {}),
	            bias_init=('XavierFill', {}))
	    xent = model.LabelCrossEntropy(["last", "label"], "xent")
	    if not mkl:
	        model.AveragedLoss(xent, "loss")
	    return model, d


	def ResNet50(order, cudnn_ws, mkl):
	    """Build a ResNet-50 benchmark network in test mode.

	    Args:
	        order: storage order forwarded to the helpers through arg_scope.
	        cudnn_ws: cuDNN workspace limit; stringified into
	            'ws_nbytes_limit'.
	        mkl: unused here; kept so the signature matches the other
	            model generators.

	    Returns:
	        A (model, 448) tuple; 448 is the square input size.
	    """
	    my_arg_scope = {'order': order, 'use_cudnn': True,
	                    'cudnn_exhaustive_search': True,
	                    'ws_nbytes_limit': str(cudnn_ws)}
	    # The net used to be named "alexnet" (copy-paste slip), which
	    # mislabelled benchmark output for this model.
	    model = ModelHelper(name="resnet50", arg_scope=my_arg_scope)
	    resnet.create_resnet50(model, "data", 3, 1000, is_test=True,
	                           final_avg_kernel=14)
	    return model, 448

	def AlexNet(order, cudnn_ws, mkl):
	    """Build the AlexNet benchmark network.

	    Five conv+ReLU stages (max-pooled after stages 1, 2 and 5) feed three
	    FC layers, a softmax ("pred") and a label cross-entropy ("xent").
	    The averaged "loss" blob is only added when `mkl` is false.

	    Args:
	        order: storage order forwarded to the helpers through arg_scope.
	        cudnn_ws: cuDNN workspace limit; stringified into
	            'ws_nbytes_limit'.
	        mkl: when true, skip the AveragedLoss operator.

	    Returns:
	        A (model, 224) tuple; 224 is the square input size.
	    """
	    my_arg_scope = {'order': order, 'use_cudnn': True,
	                    'cudnn_exhaustive_search': True,
	                    'ws_nbytes_limit': str(cudnn_ws)}
	    model = ModelHelper(name="alexnet", arg_scope=my_arg_scope)

	    # (blob name, channels in, channels out, kernel, extra conv kwargs,
	    #  name of the max-pool that follows the stage, or None)
	    conv_stages = [
	        ("conv1", 3, 64, 11, {'stride': 4, 'pad': 2}, "pool1"),
	        ("conv2", 64, 192, 5, {'pad': 2}, "pool2"),
	        ("conv3", 192, 384, 3, {'pad': 1}, None),
	        ("conv4", 384, 256, 3, {'pad': 1}, None),
	        ("conv5", 256, 256, 3, {'pad': 1}, "pool5"),
	    ]
	    blob = "data"
	    for name, dim_in, dim_out, ksize, conv_kwargs, pool_name in conv_stages:
	        blob = brew.conv(
	            model, blob, name, dim_in, dim_out, ksize,
	            ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
	        )
	        # ReLU writes back into the conv output blob.
	        blob = brew.relu(model, blob, name)
	        if pool_name is not None:
	            blob = brew.max_pool(
	                model, blob, pool_name, kernel=3, stride=2)

	    # (blob name, inputs, outputs, followed by ReLU?)
	    fc_stages = [
	        ("fc6", 256 * 6 * 6, 4096, True),
	        ("fc7", 4096, 4096, True),
	        ("fc8", 4096, 1000, False),
	    ]
	    for name, dim_in, dim_out, with_relu in fc_stages:
	        blob = brew.fc(
	            model, blob, name, dim_in, dim_out,
	            ('XavierFill', {}), ('ConstantFill', {})
	        )
	        if with_relu:
	            blob = brew.relu(model, blob, name)

	    pred = brew.softmax(model, blob, "pred")
	    xent = model.LabelCrossEntropy([pred, "label"], "xent")
	    if not mkl:
	        model.AveragedLoss(xent, "loss")
	    return model, 224


	def OverFeat(order, cudnn_ws, mkl):
	    """Build the OverFeat benchmark network.

	    Five conv+ReLU stages (2x2 max-pooled after stages 1, 2 and 5) feed
	    three FC layers, a softmax ("pred") and a label cross-entropy
	    ("xent").  The averaged "loss" blob is only added when `mkl` is false.

	    Args:
	        order: storage order forwarded to the helpers through arg_scope.
	        cudnn_ws: cuDNN workspace limit; stringified into
	            'ws_nbytes_limit'.
	        mkl: when true, skip the AveragedLoss operator.

	    Returns:
	        A (model, 231) tuple; 231 is the square input size.
	    """
	    my_arg_scope = {'order': order, 'use_cudnn': True,
	                    'cudnn_exhaustive_search': True,
	                    'ws_nbytes_limit': str(cudnn_ws)}
	    model = ModelHelper(name='overfeat', arg_scope=my_arg_scope)

	    # (blob name, channels in, channels out, kernel, extra conv kwargs,
	    #  name of the max-pool that follows the stage, or None)
	    conv_stages = [
	        ("conv1", 3, 96, 11, {'stride': 4}, "pool1"),
	        ("conv2", 96, 256, 5, {}, "pool2"),
	        ("conv3", 256, 512, 3, {'pad': 1}, None),
	        ("conv4", 512, 1024, 3, {'pad': 1}, None),
	        ("conv5", 1024, 1024, 3, {'pad': 1}, "pool5"),
	    ]
	    blob = "data"
	    for name, dim_in, dim_out, ksize, conv_kwargs, pool_name in conv_stages:
	        blob = brew.conv(
	            model, blob, name, dim_in, dim_out, ksize,
	            ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
	        )
	        # ReLU writes back into the conv output blob.
	        blob = brew.relu(model, blob, name)
	        if pool_name is not None:
	            blob = brew.max_pool(
	                model, blob, pool_name, kernel=2, stride=2)

	    # (blob name, inputs, outputs, followed by ReLU?)
	    fc_stages = [
	        ("fc6", 1024 * 6 * 6, 3072, True),
	        ("fc7", 3072, 4096, True),
	        ("fc8", 4096, 1000, False),
	    ]
	    for name, dim_in, dim_out, with_relu in fc_stages:
	        blob = brew.fc(
	            model, blob, name, dim_in, dim_out,
	            ('XavierFill', {}), ('ConstantFill', {})
	        )
	        if with_relu:
	            blob = brew.relu(model, blob, name)

	    pred = brew.softmax(model, blob, "pred")
	    xent = model.LabelCrossEntropy([pred, "label"], "xent")
	    if not mkl:
	        model.AveragedLoss(xent, "loss")
	    return model, 231


	def VGGA(order, cudnn_ws, mkl):
	    """Build the VGG-A benchmark network.

	    Eight 3x3 conv+ReLU stages (all with pad=1; 2x2 max-pooled after
	    stages 1, 2, 4, 6 and 8) feed three FC layers, a softmax ("pred") and
	    a label cross-entropy ("xent").  The averaged "loss" blob is only
	    added when `mkl` is false.

	    Args:
	        order: storage order forwarded to the helpers through arg_scope.
	        cudnn_ws: cuDNN workspace limit; stringified into
	            'ws_nbytes_limit'.
	        mkl: when true, skip the AveragedLoss operator.

	    Returns:
	        A (model, 231) tuple; 231 is the square input size.
	    """
	    my_arg_scope = {'order': order, 'use_cudnn': True,
	                    'cudnn_exhaustive_search': True,
	                    'ws_nbytes_limit': str(cudnn_ws)}
	    model = ModelHelper(name='vgg-a', arg_scope=my_arg_scope)

	    # (blob name, channels in, channels out,
	    #  name of the max-pool that follows the stage, or None)
	    conv_stages = [
	        ("conv1", 3, 64, "pool1"),
	        ("conv2", 64, 128, "pool2"),
	        ("conv3", 128, 256, None),
	        ("conv4", 256, 256, "pool4"),
	        ("conv5", 256, 512, None),
	        ("conv6", 512, 512, "pool6"),
	        ("conv7", 512, 512, None),
	        ("conv8", 512, 512, "pool8"),
	    ]
	    blob = "data"
	    for name, dim_in, dim_out, pool_name in conv_stages:
	        blob = brew.conv(
	            model, blob, name, dim_in, dim_out, 3,
	            ('XavierFill', {}), ('ConstantFill', {}), pad=1
	        )
	        # ReLU writes back into the conv output blob.
	        blob = brew.relu(model, blob, name)
	        if pool_name is not None:
	            blob = brew.max_pool(
	                model, blob, pool_name, kernel=2, stride=2)

	    # (blob name, inputs, outputs, followed by ReLU?)
	    fc_stages = [
	        ("fcix", 512 * 7 * 7, 4096, True),
	        ("fcx", 4096, 4096, True),
	        ("fcxi", 4096, 1000, False),
	    ]
	    for name, dim_in, dim_out, with_relu in fc_stages:
	        blob = brew.fc(
	            model, blob, name, dim_in, dim_out,
	            ('XavierFill', {}), ('ConstantFill', {})
	        )
	        if with_relu:
	            blob = brew.relu(model, blob, name)

	    pred = brew.softmax(model, blob, "pred")
	    xent = model.LabelCrossEntropy([pred, "label"], "xent")
	    if not mkl:
	        model.AveragedLoss(xent, "loss")
	    return model, 231


	def _InceptionModule(
	    model, input_blob, input_depth, output_name, conv1_depth, conv3_depths,
	    conv5_depths, pool_depth
	):
	    """Add one four-branch inception module to ``model``.

	    Branches, all reading ``input_blob``:
	      1. 1x1 conv
	      2. 1x1 conv followed by 3x3 conv (pad=1)
	      3. 1x1 conv followed by 5x5 conv (pad=2)
	      4. 3x3 / stride-1 max pool (pad=1) followed by 1x1 conv
	    Every convolution is followed by a ReLU whose output blob is the
	    convolution's own output. The four branch outputs are concatenated
	    into a blob named ``output_name``.

	    Args:
	        model: model the operators are added to.
	        input_blob: blob feeding all four branches.
	        input_depth: input channel count passed to the convs on input_blob.
	        output_name: concat blob name, also the prefix of inner blob names.
	        conv1_depth: output channels of branch 1.
	        conv3_depths: [reduce channels, output channels] for branch 2.
	        conv5_depths: [reduce channels, output channels] for branch 3.
	        pool_depth: output channels of the branch-4 projection.

	    Returns:
	        The result of brew.concat over the four branch outputs.
	    """
	    def conv_relu(src, suffix, dim_in, dim_out, kernel, **conv_kwargs):
	        # Convolution named "<output_name><suffix>" plus ReLU on the same blob.
	        conv_out = brew.conv(
	            model, src, output_name + suffix, dim_in, dim_out, kernel,
	            ('XavierFill', {}), ('ConstantFill', {}), **conv_kwargs
	        )
	        return brew.relu(model, conv_out, conv_out)

	    # Branch 1: 1x1 conv.
	    branch_1x1 = conv_relu(input_blob, ":conv1", input_depth, conv1_depth, 1)

	    # Branch 2: 1x1 reduction, then 3x3 conv.
	    reduce_3x3 = conv_relu(
	        input_blob, ":conv3_reduce", input_depth, conv3_depths[0], 1
	    )
	    branch_3x3 = conv_relu(
	        reduce_3x3, ":conv3", conv3_depths[0], conv3_depths[1], 3, pad=1
	    )

	    # Branch 3: 1x1 reduction, then 5x5 conv.
	    reduce_5x5 = conv_relu(
	        input_blob, ":conv5_reduce", input_depth, conv5_depths[0], 1
	    )
	    branch_5x5 = conv_relu(
	        reduce_5x5, ":conv5", conv5_depths[0], conv5_depths[1], 5, pad=2
	    )

	    # Branch 4: max pool, then 1x1 projection.
	    pooled = brew.max_pool(
	        model, input_blob, output_name + ":pool", kernel=3, stride=1, pad=1
	    )
	    branch_pool = conv_relu(pooled, ":pool_proj", input_depth, pool_depth, 1)

	    return brew.concat(
	        model, [branch_1x1, branch_3x3, branch_5x5, branch_pool], output_name
	    )


	def Inception(order, cudnn_ws, mkl):
	    """Build the 'inception' benchmark network.

	    The stem is a 7x7 / stride-2 conv, a max pool, a 1x1 conv and a 3x3
	    conv followed by a second max pool. Nine inception modules (inc3 to
	    inc11) follow, with max pools after inc4 and inc9. The head is a 7x7
	    average pool, a 1024 -> 1000 fully connected layer, a softmax and a
	    label cross-entropy.

	    Args:
	        order: stored under 'order' in the model's arg scope.
	        cudnn_ws: stringified into the 'ws_nbytes_limit' arg-scope entry.
	        mkl: when truthy, the AveragedLoss op producing "loss" is omitted.

	    Returns:
	        The tuple (model, 224). Benchmark in this file uses the second
	        element as the square spatial size of the "data" blob.
	    """
	    model = ModelHelper(
	        name="inception",
	        arg_scope={
	            'order': order,
	            'use_cudnn': True,
	            'cudnn_exhaustive_search': True,
	            'ws_nbytes_limit': str(cudnn_ws),
	        },
	    )

	    # Stem.
	    blob = brew.conv(
	        model, "data", "conv1", 3, 64, 7,
	        ('XavierFill', {}), ('ConstantFill', {}), stride=2, pad=3
	    )
	    blob = brew.relu(model, blob, "conv1")
	    blob = brew.max_pool(model, blob, "pool1", kernel=3, stride=2, pad=1)
	    blob = brew.conv(
	        model, blob, "conv2a", 64, 64, 1,
	        ('XavierFill', {}), ('ConstantFill', {})
	    )
	    blob = brew.relu(model, blob, blob)
	    blob = brew.conv(
	        model, blob, "conv2", 64, 192, 3,
	        ('XavierFill', {}), ('ConstantFill', {}), pad=1
	    )
	    blob = brew.relu(model, blob, "conv2")
	    blob = brew.max_pool(model, blob, "pool2", kernel=3, stride=2, pad=1)

	    # Inception modules:
	    # (name, input depth, conv1 depth, conv3 depths, conv5 depths,
	    #  pool depth, name of the max pool that follows or None)
	    module_specs = [
	        ("inc3", 192, 64, [96, 128], [16, 32], 32, None),
	        ("inc4", 256, 128, [128, 192], [32, 96], 64, "pool5"),
	        ("inc5", 480, 192, [96, 208], [16, 48], 64, None),
	        ("inc6", 512, 160, [112, 224], [24, 64], 64, None),
	        ("inc7", 512, 128, [128, 256], [24, 64], 64, None),
	        ("inc8", 512, 112, [144, 288], [32, 64], 64, None),
	        ("inc9", 528, 256, [160, 320], [32, 128], 128, "pool9"),
	        ("inc10", 832, 256, [160, 320], [32, 128], 128, None),
	        ("inc11", 832, 384, [192, 384], [48, 128], 128, None),
	    ]
	    for name, depth_in, c1, c3, c5, proj, pool_after in module_specs:
	        blob = _InceptionModule(model, blob, depth_in, name, c1, c3, c5, proj)
	        if pool_after is not None:
	            blob = brew.max_pool(
	                model, blob, pool_after, kernel=3, stride=2, pad=1
	            )

	    # Classifier head.
	    blob = brew.average_pool(model, blob, "pool11", kernel=7, stride=1)
	    logits = brew.fc(
	        model, blob, "fc", 1024, 1000,
	        ('XavierFill', {}), ('ConstantFill', {})
	    )
	    # Soumith's benchmark apparently has no softmax on top of Inception;
	    # one is added here regardless so that a proper backward pass exists.
	    pred = brew.softmax(model, logits, "pred")
	    xent = model.LabelCrossEntropy([pred, "label"], "xent")
	    if not mkl:
	        model.AveragedLoss(xent, "loss")
	    return model, 224


	def AddParameterUpdate(model):
	    """Append a plain SGD step for every parameter of ``model``.

	    The update is param <- 1.0 * param + LR * grad, where LR comes from a
	    "step" learning-rate policy with a negative base rate. The values are
	    not tuned for real training; the ops exist so the benchmark includes
	    a parameter update.
	    """
	    step_counter = brew.iter(model, "iter")
	    learning_rate = model.LearningRate(
	        step_counter,
	        "LR",
	        base_lr=-1e-8,
	        policy="step",
	        stepsize=10000,
	        gamma=0.999,
	    )
	    # Constant 1.0 weight for the parameter term of the weighted sum.
	    unit = model.param_init_net.ConstantFill(
	        [], "ONE", shape=[1], value=1.0
	    )
	    for weight in model.params:
	        gradient = model.param_to_grad[weight]
	        # Output blob is the parameter itself.
	        model.WeightedSum([weight, unit, gradient, learning_rate], weight)


	def Benchmark(model_gen, arg):
	    """Build a model with ``model_gen`` and time it via workspace.BenchmarkNet.

	    Args:
	        model_gen: callable taking (order, cudnn_ws, mkl) and returning
	            (model, input_size).
	        arg: parsed command-line namespace. Fields read here: order,
	            cudnn_ws, mkl, net_type, num_workers, batch_size, model,
	            forward_only, cpu, engine, dump_model, warmup_iterations,
	            iterations and layer_wise_benchmark.

	    Returns:
	        None. In MKL mode without forward_only the function prints a
	        warning and returns before running anything, because the
	        forward-backward pass is not supported there.
	    """
	    model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.mkl)
	    model.Proto().type = arg.net_type
	    model.Proto().num_workers = arg.num_workers

	    # In order to be able to run everything without feeding more stuff, let's
	    # add the data and label blobs to the parameter initialization net as well.
	    if arg.order == "NCHW":
	        input_shape = [arg.batch_size, 3, input_size, input_size]
	    else:
	        input_shape = [arg.batch_size, input_size, input_size, 3]
	    if arg.model == "MLP":
	        # The MLP takes a flat (batch, features) input.
	        input_shape = [arg.batch_size, input_size]

	    model.param_init_net.GaussianFill(
	        [],
	        "data",
	        shape=input_shape,
	        mean=0.0,
	        std=1.0
	    )
	    # MKL doesn't support int, so have to use numpy
	    if arg.mkl:
	        label = np.random.randint(
	            low=0, high=1000, size=(arg.batch_size,)
	        ).astype(np.int32)
	        workspace.FeedBlob("label", label)
	    else:
	        model.param_init_net.UniformIntFill(
	            [],
	            "label",
	            shape=[arg.batch_size, ],
	            min=0,
	            max=999
	        )

	    if arg.forward_only:
	        print('{}: running forward only.'.format(arg.model))
	    else:
	        if arg.mkl:
	            print(
	                '==WARNING==\n'
	                'forward-backward not supported yet in MKL, so exiting'
	            )
	            # Actually stop here, as the message says. The model builders
	            # visible in this file only create the "loss" blob when mkl is
	            # false, so AddGradientOperators(["loss"]) below would have
	            # nothing to differentiate.
	            return
	        print('{}: running forward-backward.'.format(arg.model))
	        model.AddGradientOperators(["loss"])
	        AddParameterUpdate(model)
	        if arg.order == 'NHWC':
	            print(
	                '==WARNING==\n'
	                'NHWC order with CuDNN may not be supported yet, so I might\n'
	                'exit suddenly.'
	            )

	    if not arg.cpu:
	        if arg.mkl:
	            model.param_init_net.RunAllOnMKL()
	            model.net.RunAllOnMKL()
	        else:
	            model.param_init_net.RunAllOnGPU()
	            model.net.RunAllOnGPU()

	    if arg.engine:
	        # Force the requested engine on every operator of the main net.
	        for op in model.net.Proto().op:
	            op.engine = arg.engine

	    if arg.dump_model:
	        # Writes out the pbtxt for benchmarks on e.g. Android
	        with open(
	            "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w"
	        ) as fid:
	            fid.write(str(model.param_init_net.Proto()))
	        with open("{0}.pbtxt".format(arg.model), "w") as fid:
	            fid.write(str(model.net.Proto()))

	    workspace.RunNetOnce(model.param_init_net)
	    workspace.CreateNet(model.net)
	    workspace.BenchmarkNet(
	        model.net.Proto().name, arg.warmup_iterations, arg.iterations,
	        arg.layer_wise_benchmark)


	def GetArgumentParser():
	    """Create the argparse parser for the benchmark's command-line options.

	    Returns:
	        An argparse.ArgumentParser with the options registered in the
	        order listed below. Nothing is parsed here.
	    """
	    parser = argparse.ArgumentParser(description="Caffe2 benchmark.")

	    # (flag, keyword arguments forwarded to add_argument), in help order.
	    option_specs = [
	        ("--batch_size",
	         dict(type=int, default=128, help="The batch size.")),
	        ("--model",
	         dict(type=str, help="The model to benchmark.")),
	        ("--order",
	         dict(type=str, default="NCHW", help="The order to evaluate.")),
	        ("--cudnn_ws",
	         dict(type=int, help="The cudnn workspace size.")),
	        ("--iterations",
	         dict(type=int, default=10,
	              help="Number of iterations to run the network.")),
	        ("--warmup_iterations",
	         dict(type=int, default=10,
	              help="Number of warm-up iterations before benchmarking.")),
	        ("--forward_only",
	         dict(action='store_true',
	              help="If set, only run the forward pass.")),
	        ("--layer_wise_benchmark",
	         dict(action='store_true',
	              help="If True, run the layer-wise benchmark as well.")),
	        ("--cpu",
	         dict(action='store_true',
	              help="If True, run testing on CPU instead of GPU.")),
	        ("--mkl",
	         dict(action='store_true',
	              help="If True, run testing on CPU-MKL instead of GPU.")),
	        ("--engine",
	         dict(type=str, default="",
	              help="If set, blindly prefer the given engine(s) for every "
	                   "op.")),
	        ("--dump_model",
	         dict(action='store_true',
	              help="If True, dump the model prototxts to disk.")),
	        ("--net_type", dict(type=str, default="simple")),
	        ("--num_workers", dict(type=int, default=2)),
	        ("--use-nvtx", dict(default=False, action='store_true')),
	        ("--htrace_span_log_path", dict(type=str)),
	    ]
	    for flag, options in option_specs:
	        parser.add_argument(flag, **options)
	    return parser


	if __name__ == '__main__':
	    # Build the parser once and reuse it for parsing, help and errors.
	    parser = GetArgumentParser()
	    args, extra_args = parser.parse_known_args()
	    if not args.batch_size or not args.model or not args.order:
	        parser.print_help()
	    else:
	        model_map = {
	            'AlexNet': AlexNet,
	            'OverFeat': OverFeat,
	            'VGGA': VGGA,
	            'Inception': Inception,
	            'ResNet50': ResNet50,
	            'MLP': MLP,
	        }
	        # Reject an unknown model before initialising Caffe2, with a usage
	        # message instead of a bare KeyError traceback.
	        if args.model not in model_map:
	            parser.error(
	                "unknown model '{}'; choose from: {}".format(
	                    args.model, ', '.join(sorted(model_map))
	                )
	            )

	        # Flags not recognised by the parser are forwarded to GlobalInit.
	        init_args = ['caffe2', '--caffe2_log_level=0'] + extra_args
	        if args.use_nvtx:
	            init_args.append('--caffe2_use_nvtx')
	        if args.htrace_span_log_path:
	            init_args.append(
	                '--caffe2_htrace_span_log_path=' + args.htrace_span_log_path
	            )
	        workspace.GlobalInit(init_args)

	        Benchmark(model_map[args.model], args)