fbsync
diff --git a/build.py b/build.py
index f148b1d..1da6135 100644
--- a/build.py
+++ b/build.py
@@ -24,7 +24,7 @@
     # Otherwise, use the following line: we will build protobuf using the
     # included source file.
     #USE_SYSTEM_PROTOBUF = False
-    #PROTOC_BINARY = 'gen/third_party/protoc'
+    #PROTOC_BINARY = 'gen/third_party/google/protoc'
     # Note for the line above: if you are doing things like cross-compilation,
     # the built protoc compiler will not work on the host, in which case you
     # will need to provide a protoc binary that can run on the host environment.
diff --git a/build_android.py b/build_android.py
index b3aa329..5a71846 100644
--- a/build_android.py
+++ b/build_android.py
@@ -14,7 +14,7 @@
 from build import Config
 
 STANDALONE_TCHAIN_ROOT = (
-    '/opt/android_ndk/android-ndk-r10e/'
+    '/Users/jiayq/Research/android-ndk-r12b/'
     'standalone-toolchains/arm-linux-androideabi-4.9-android-21/')
 
 # We change necessary components in the Config class.
@@ -24,7 +24,7 @@
 Config.AR = STANDALONE_TCHAIN_ROOT + 'bin/arm-linux-androideabi-ar'
 Config.GENDIR = "gen-android"
 Config.USE_SYSTEM_PROTOBUF = False
-Config.PROTOC_BINARY = 'gen/third_party/protoc'
+Config.PROTOC_BINARY = 'gen/third_party/google/protoc'
 Config.USE_LITE_PROTO = False
 Config.USE_SYSTEM_EIGEN = False
 Config.USE_GLOG = False
diff --git a/build_android_prepare.py b/build_android_prepare.py
index 01975eb..ed306cf 100644
--- a/build_android_prepare.py
+++ b/build_android_prepare.py
@@ -13,7 +13,7 @@
     Brewery.Run(
         Config,
         ['build_android_prepare.py',
-         'build', '//third_party:protoc'])
+         'build', '//third_party/google:protoc'])
 else:
     print('This script is not intended to be used as an imported module.')
     sys.exit(1)
diff --git a/caffe2/binaries/convert_caffe_image_db.cc b/caffe2/binaries/convert_caffe_image_db.cc
index e1ac8db..e0e207d 100644
--- a/caffe2/binaries/convert_caffe_image_db.cc
+++ b/caffe2/binaries/convert_caffe_image_db.cc
@@ -28,7 +28,7 @@
   int count = 0;
   for (; cursor->Valid(); cursor->Next()) {
     caffe::Datum datum;
-    CHECK(datum.ParseFromString(cursor->value()));
+    CAFFE_ENFORCE(datum.ParseFromString(cursor->value()));
     TensorProtos protos;
     TensorProto* data = protos.add_protos();
     TensorProto* label = protos.add_protos();
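Note: the hunk above is representative of a pattern applied throughout this patch: glog-style `CHECK(cond) << msg` streams are replaced by Caffe2's variadic CAFFE_ENFORCE, which builds the message from its arguments and throws caffe2::EnforceNotMet instead of aborting the process. A minimal sketch of the two styles (the parse condition and message text are just illustrative):

    // Before: aborts the whole process on failure; message via stream insertion.
    CHECK(datum.ParseFromString(cursor->value()))
        << "Could not parse record for key " << cursor->key();

    // After: throws caffe2::EnforceNotMet; message built from variadic arguments.
    CAFFE_ENFORCE(
        datum.ParseFromString(cursor->value()),
        "Could not parse record for key ", cursor->key());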
diff --git a/caffe2/binaries/convert_encoded_to_raw_leveldb.cc b/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
index 9a70638..aefc81f 100644
--- a/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
+++ b/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
@@ -45,7 +45,7 @@
     leveldb::DB* db_temp;
     leveldb::Status status = leveldb::DB::Open(
         options, input_db_name, &db_temp);
-    CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
+    CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, ".");
     input_db.reset(db_temp);
   }
 
@@ -61,8 +61,11 @@
     leveldb::DB* db_temp;
     leveldb::Status status = leveldb::DB::Open(
         options, output_db_name, &db_temp);
-    CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
-        << ". Is it already existing?";
+    CAFFE_ENFORCE(
+        status.ok(),
+        "Failed to open leveldb ",
+        output_db_name,
+        ". Is it already existing?");
     output_db.reset(db_temp);
   }
   batch.reset(new leveldb::WriteBatch());
@@ -84,7 +87,7 @@
   iter->SeekToFirst();
   int count = 0;
   for (; iter->Valid(); iter->Next()) {
-    CHECK(input_protos.ParseFromString(iter->value().ToString()));
+    CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString()));
     label->CopyFrom(input_protos.protos(1));
     const string& encoded_image = input_protos.protos(0).string_data(0);
     int encoded_size = encoded_image.size();
diff --git a/caffe2/binaries/fb_run_plan_mpi.cc b/caffe2/binaries/fb_run_plan_mpi.cc
index 204501e..822071a 100644
--- a/caffe2/binaries/fb_run_plan_mpi.cc
+++ b/caffe2/binaries/fb_run_plan_mpi.cc
@@ -62,7 +62,7 @@
   }
 
   caffe2::PlanDef plan_def;
-  CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
   std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
   workspace->RunPlan(plan_def);
 
diff --git a/caffe2/binaries/inspect_gpus.cc b/caffe2/binaries/inspect_gpus.cc
index 4c58071..1a9cdc2 100644
--- a/caffe2/binaries/inspect_gpus.cc
+++ b/caffe2/binaries/inspect_gpus.cc
@@ -26,7 +26,7 @@
   }
 
   vector<vector<bool> > access_pattern;
-  CHECK(caffe2::GetCudaPeerAccessPattern(&access_pattern));
+  CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));
 
   std::stringstream sstream;
   // Find topology
diff --git a/caffe2/binaries/make_cifar_db.cc b/caffe2/binaries/make_cifar_db.cc
index 15eb9ef..1c5ffd1 100644
--- a/caffe2/binaries/make_cifar_db.cc
+++ b/caffe2/binaries/make_cifar_db.cc
@@ -76,7 +76,7 @@
   LOG(INFO) << "Converting file " << filename;
   std::ifstream data_file(filename.c_str(),
       std::ios::in | std::ios::binary);
-  CHECK(data_file) << "Unable to open file " << filename;
+  CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
   char str_buffer[kCIFARImageNBytes];
   int label_value;
   string serialized_protos;
diff --git a/caffe2/binaries/make_mnist_db.cc b/caffe2/binaries/make_mnist_db.cc
index 4d26c2b..7086ff6 100644
--- a/caffe2/binaries/make_mnist_db.cc
+++ b/caffe2/binaries/make_mnist_db.cc
@@ -32,8 +32,8 @@
   // Open files
   std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
   std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
-  CHECK(image_file) << "Unable to open file " << image_filename;
-  CHECK(label_file) << "Unable to open file " << label_filename;
+  CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename);
+  CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename);
   // Read the magic and the meta data
   uint32_t magic;
   uint32_t num_items;
diff --git a/caffe2/binaries/predictor_verifier.cc b/caffe2/binaries/predictor_verifier.cc
index feb39a3..56d9cd0 100644
--- a/caffe2/binaries/predictor_verifier.cc
+++ b/caffe2/binaries/predictor_verifier.cc
@@ -19,8 +19,8 @@
     LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net.";
   }
   caffe2::NetDef init_net, predict_net;
-  CHECK(ReadProtoFromFile(FLAGS_init_net, &init_net));
-  CHECK(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
+  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
+  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
   // Can be large due to constant fills
   VLOG(1) << "Init net: " << ProtoDebugString(init_net);
   LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
diff --git a/caffe2/binaries/run_plan.cc b/caffe2/binaries/run_plan.cc
index a6b3e5c..7bc4c64 100644
--- a/caffe2/binaries/run_plan.cc
+++ b/caffe2/binaries/run_plan.cc
@@ -14,7 +14,7 @@
   }
   LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
   caffe2::PlanDef plan_def;
-  CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
   std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
   workspace->RunPlan(plan_def);
 
diff --git a/caffe2/binaries/run_plan_mpi.cc b/caffe2/binaries/run_plan_mpi.cc
index a439d3d..dda9a8f 100644
--- a/caffe2/binaries/run_plan_mpi.cc
+++ b/caffe2/binaries/run_plan_mpi.cc
@@ -21,7 +21,7 @@
   caffe2::GlobalInit(&argc, &argv);
   LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
   caffe2::PlanDef plan_def;
-  CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
   std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
   workspace->RunPlan(plan_def);
 
diff --git a/caffe2/binaries/speed_benchmark.cc b/caffe2/binaries/speed_benchmark.cc
index c794aa8..5f58169 100644
--- a/caffe2/binaries/speed_benchmark.cc
+++ b/caffe2/binaries/speed_benchmark.cc
@@ -16,12 +16,12 @@
   std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
   // Run initialization network.
   caffe2::NetDef net_def;
-  CHECK(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
-  CHECK(workspace->RunNetOnce(net_def));
-  CHECK(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
+  CAFFE_ENFORCE(workspace->RunNetOnce(net_def));
+  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
   caffe2::NetBase* net = workspace->CreateNet(net_def);
   CHECK_NOTNULL(net);
-  CHECK(net->Run());
+  CAFFE_ENFORCE(net->Run());
   net->TEST_Benchmark(caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);
   return 0;
 }
diff --git a/caffe2/binaries/zmq_feeder.cc b/caffe2/binaries/zmq_feeder.cc
index d2fcfb6..fc44657 100644
--- a/caffe2/binaries/zmq_feeder.cc
+++ b/caffe2/binaries/zmq_feeder.cc
@@ -23,8 +23,10 @@
   LOG(INFO) << "Opening DB...";
   auto in_db = caffe2::db::CreateDB(
       caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ);
-  CHECK(in_db) << "Cannot load input db " << caffe2::FLAGS_input_db
-               << " of expected type " << caffe2::FLAGS_input_db_type;
+  CAFFE_ENFORCE(
+      in_db,
+      "Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " +
+          caffe2::FLAGS_input_db_type);
   auto cursor = in_db->NewCursor();
   LOG(INFO) << "DB opened.";
 
diff --git a/caffe2/contrib/nervana/nervana_init_gpu.cc b/caffe2/contrib/nervana/nervana_init_gpu.cc
index 81dfa20..994fc97 100644
--- a/caffe2/contrib/nervana/nervana_init_gpu.cc
+++ b/caffe2/contrib/nervana/nervana_init_gpu.cc
@@ -29,10 +29,10 @@
     VLOG(1) << "Loaded nervana kernels from path "
                   << FLAGS_nervana_cubin_path;
   } else {
-    // Since this is not a critical error we will just log it in info.
-    LOG(INFO) << "Cannot load nervana gpu kernels from path "
-              << FLAGS_nervana_cubin_path
-              << ", will disable Caffe2 nervana engines.";
+    // Since this is not a critical error we will just vlog it.
+    VLOG(1) << "Cannot load nervana gpu kernels from path "
+            << FLAGS_nervana_cubin_path
+            << ", will disable Caffe2 nervana engines.";
   }
   // We will always return true for this initialization, because the loading
   // result is kept and accessible via NervanaKernelLoaded(). This allows us
diff --git a/caffe2/contrib/nervana/nervana_math_gpu.cc b/caffe2/contrib/nervana/nervana_math_gpu.cc
index d23cbcb..f3010b9 100644
--- a/caffe2/contrib/nervana/nervana_math_gpu.cc
+++ b/caffe2/contrib/nervana/nervana_math_gpu.cc
@@ -21,9 +21,24 @@
   int ldb = (TransB == CblasNoTrans) ? N : K;
   bool a_t = (TransA == CblasTrans);
   bool b_t = (TransB == CblasTrans);
-  CHECK(nervana_sgemm(
-      const_cast<float*>(A), const_cast<float*>(B), C, a_t, b_t, M, N, K,
-      lda, ldb, N, alpha, beta, nullptr, false, false, context->cuda_stream()));
+  CAFFE_ENFORCE(nervana_sgemm(
+      const_cast<float*>(A),
+      const_cast<float*>(B),
+      C,
+      a_t,
+      b_t,
+      M,
+      N,
+      K,
+      lda,
+      ldb,
+      N,
+      alpha,
+      beta,
+      nullptr,
+      false,
+      false,
+      context->cuda_stream()));
 }
 
 }  // namespace math
diff --git a/caffe2/contrib/nnpack/nnpack_ops.cc b/caffe2/contrib/nnpack/nnpack_ops.cc
index 374015d..d3931eb 100644
--- a/caffe2/contrib/nnpack/nnpack_ops.cc
+++ b/caffe2/contrib/nnpack/nnpack_ops.cc
@@ -72,10 +72,10 @@
             OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
         kts_(get_nnp_convolution_transform_strategy(
             OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->order_ == StorageOrder::NCHW,
-        "NNPack only supports NCHW order. Please consider add \
-            TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
+        "NNPack only supports NCHW order. Please consider adding "
+        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
   }
 
   bool RunOnDeviceWithOrderNCHW() override;
@@ -176,28 +176,28 @@
  public:
   NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
       : ConvPoolOpBase<CPUContext>(operator_def, ws) {
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->order_ == StorageOrder::NCHW,
-        "NNPack only supports NCHW order. Please consider add \
-            TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
-    CAFFE_ENFORCE(
+        "NNPack only supports NCHW order. Please consider add "
+        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
+    OPERATOR_NEEDS_FEATURE(
         this->kernel_h_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->kernel_w_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->stride_h_ == 2, "NNPack only supports MaxPool stride size 2*2!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->stride_w_ == 2, "NNPack only supports MaxPool stride size 2*2!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->pad_t_ == 0,
         "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->pad_l_ == 0,
         "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->pad_r_ == 0,
         "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
-    CAFFE_ENFORCE(
+    OPERATOR_NEEDS_FEATURE(
         this->pad_b_ == 0,
         "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
   }
diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py
index 5205f3b..5316c4b 100644
--- a/caffe2/contrib/nnpack/nnpack_ops_test.py
+++ b/caffe2/contrib/nnpack/nnpack_ops_test.py
@@ -9,9 +9,12 @@
 import numpy as np
 import time
 import os
-from caffe2.python import core
+from caffe2.python import core, dyndep
 import caffe2.python.hypothesis_test_util as hu
 
+
+dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")
+
 np.random.seed(1)
 
 
diff --git a/caffe2/contrib/torch/torch_op.h b/caffe2/contrib/torch/torch_op.h
index 037b4db..4669392 100644
--- a/caffe2/contrib/torch/torch_op.h
+++ b/caffe2/contrib/torch/torch_op.h
@@ -49,9 +49,10 @@
   }
 
   static const char* tensorTy(const Blob& blob) {
-    CHECK(blob.template IsType<Tensor<Context>>());
+    CAFFE_ENFORCE(blob.template IsType<Tensor<Context>>());
     const auto& tc = blob.template Get<Tensor<Context>>();
-    CHECK(tc.template IsType<float>()) << tc.meta().name() << ", " << tc.size();
+    CAFFE_ENFORCE(
+        tc.template IsType<float>(), tc.meta().name(), ", ", tc.size());
     return Traits::tensorTy;
   }
 
@@ -141,7 +142,7 @@
     auto* thDst = static_cast<typename Traits::Tensor*>(torchDst);
     auto* tcDst = dst->template GetMutable<Tensor<Context>>();
     CHECK_NOTNULL(src->storage->data);
-    CHECK(src->storage->size);
+    CAFFE_ENFORCE(src->storage->size);
     CHECK_EQ(src->storage->data, thDst->storage->data);
     CHECK_EQ(src->storage->data, tcDst->template data<float>());
     CHECK_EQ(src->storage->size, thDst->storage->size);
@@ -162,10 +163,10 @@
       return;
     }
 
-    CHECK(lua_istable(L(), -1));
+    CAFFE_ENFORCE(lua_istable(L(), -1));
     lua_pushnil(L());
     for (auto i = 0; i < blobs.size(); ++i) {
-      CHECK(lua_next(L(), -2));
+      CAFFE_ENFORCE(lua_next(L(), -2));
       verifyOutput(blobs[i], tensors[i]);
       lua_pop(L(), 1);
     }
@@ -264,7 +265,8 @@
       lua_pushnil(L);
       int i = 0;
       while (lua_next(L, -3) && i < paramBlobs.size()) {
-        CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+        CAFFE_ENFORCE(
+            luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
         auto* param =
             static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
                 luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -275,7 +277,7 @@
           tc->Resize(paramShape);
           tc->template mutable_data<float>();
         } else {
-          CHECK(tc->dims() == paramShape);
+          CAFFE_ENFORCE(tc->dims() == paramShape);
         }
         lua_pop(L, 1);
         i++;
@@ -286,7 +288,8 @@
     lua_getfield(L, -1, "output");
     if (outputBlobs.size() == 0) {
     } else if (outputBlobs.size() == 1) {
-      CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+      CAFFE_ENFORCE(
+          luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
       auto* output =
           static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
               luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -299,7 +302,8 @@
       lua_pushnil(L);
       auto i = 0;
       while (lua_next(L, -2) && i < outputBlobs.size()) {
-        CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+        CAFFE_ENFORCE(
+            luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
         auto* output =
             static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
                 luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -310,7 +314,7 @@
           tc->Resize(outputShape);
           tc->template mutable_data<float>();
         } else {
-          CHECK(tc->dims() == outputShape);
+          CAFFE_ENFORCE(tc->dims() == outputShape);
         }
         ++i;
       }
@@ -385,8 +389,9 @@
       lua_pushnil(L);
       auto i = 0;
       while (lua_next(L, -2) && i < numParams) {
-        CHECK(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])))
-            << luaT_typename(L, -1);
+        CAFFE_ENFORCE(
+            luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])),
+            luaT_typename(L, -1));
         auto* udata = luaT_toudata(L, -1, state_.tensorTy(*paramBlobs[i]));
         state_.setTensor(
             static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
@@ -517,7 +522,7 @@
       lua_pushnil(L);
       auto i = 0;
       while (lua_next(L, -3) && i < numParams) {
-        CHECK(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])));
+        CAFFE_ENFORCE(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])));
         auto* udata = luaT_toudata(L, -1, state_.tensorTy(*paramBlobs[i]));
         state_.setTensor(
             static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
@@ -530,7 +535,7 @@
       lua_pushnil(L);
       i = 0;
       while (lua_next(L, -2) && i < numParams) {
-        CHECK(luaT_isudata(L, -1, state_.tensorTy(*gradParamBlobs[i])));
+        CAFFE_ENFORCE(luaT_isudata(L, -1, state_.tensorTy(*gradParamBlobs[i])));
         auto* udata = luaT_toudata(L, -1, state_.tensorTy(*gradParamBlobs[i]));
         state_.setTensor(
             static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
diff --git a/caffe2/contrib/torch/torch_op_gpu.cpp b/caffe2/contrib/torch/torch_op_gpu.cpp
index 9d64365..14f2ac8 100644
--- a/caffe2/contrib/torch/torch_op_gpu.cpp
+++ b/caffe2/contrib/torch/torch_op_gpu.cpp
@@ -29,9 +29,9 @@
 THCState* cudaState(Torch<CUDAContext>* t) {
   auto* L = t->L();
   lua_getglobal(L, "cutorch");
-  CHECK(!lua_isnil(L, -1));
+  CAFFE_ENFORCE(!lua_isnil(L, -1));
   lua_getfield(L, -1, "_state");
-  CHECK(!lua_isnil(L, -1));
+  CAFFE_ENFORCE(!lua_isnil(L, -1));
   THCState* state = reinterpret_cast<THCState*>(lua_touserdata(L, -1));
   lua_pop(L, 2);
   return state;
diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
index a7223ae..2324286 100644
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@@ -152,7 +152,8 @@
    */
   void Serialize(
       const string& name,
-      BlobSerializerBase::SerializationAcceptor acceptor) const;
+      BlobSerializerBase::SerializationAcceptor acceptor,
+      int chunk_size = -1) const;
 
   /**
    * @brief Convenience function to serialize a blob to a string.
diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc
index 0eba012..5d8d967 100644
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@@ -134,7 +134,7 @@
     blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor);                   \
     string serialized = blob.Serialize("test");                            \
     BlobProto proto;                                                       \
-    CHECK(proto.ParseFromString(serialized));                              \
+    CAFFE_ENFORCE(proto.ParseFromString(serialized));                      \
     EXPECT_EQ(proto.name(), "test");                                       \
     EXPECT_EQ(proto.type(), "Tensor");                                     \
     EXPECT_TRUE(proto.has_tensor());                                       \
@@ -183,7 +183,7 @@
     blob.Reset(new TensorCUDA(tensor, &context));
     string serialized = blob.Serialize("test");
     BlobProto proto;
-    CHECK(proto.ParseFromString(serialized));
+    CAFFE_ENFORCE(proto.ParseFromString(serialized));
     EXPECT_EQ(proto.name(), "test");
     EXPECT_TRUE(proto.has_tensor());
     const TensorProto& tensor_proto = proto.tensor();
diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 72cd57e..c99effc 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -30,7 +30,7 @@
       const Blob& blob,
       const string& name,
       SerializationAcceptor acceptor) override {
-    CHECK(blob.IsType<std::string>());
+    CAFFE_ENFORCE(blob.IsType<std::string>());
 
     BlobProto blob_proto;
     blob_proto.set_name(name);
@@ -72,10 +72,11 @@
 // The blob serialization member function implementation.
 void Blob::Serialize(
     const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor) const {
+    BlobSerializerBase::SerializationAcceptor acceptor,
+    int chunk_size) const {
   std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
   CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
-  serializer->Serialize(*this, name, acceptor);
+  serializer->SerializeWithChunkSize(*this, name, acceptor, chunk_size);
 }
 
 // The blob serialization member function implementation.
diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h
index 189e336..6a448c9 100644
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@@ -49,6 +49,12 @@
       const Blob& blob,
       const string& name,
       SerializationAcceptor acceptor) override;
+  void SerializeWithChunkSize(
+      const Blob& blob,
+      const string& name,
+      SerializationAcceptor acceptor,
+      int chunk_size) override;
+
   void Serialize(const Tensor<Context>& tensor, const string& name,
                  TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
 
@@ -175,15 +181,26 @@
     const Blob& blob,
     const string& name,
     BlobSerializerBase::SerializationAcceptor acceptor) {
-  CHECK(blob.IsType<Tensor<Context>>());
+  this->SerializeWithChunkSize(
+      blob, name, acceptor, FLAGS_caffe2_tensor_chunk_size);
+}
+
+template <class Context>
+void TensorSerializer<Context>::SerializeWithChunkSize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor,
+    int chunk_size) {
+  CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
   const auto& tensor = blob.template Get<Tensor<Context>>();
+  chunk_size = chunk_size == -1 ? FLAGS_caffe2_tensor_chunk_size : chunk_size;
 
 #ifndef __ANDROID__
   std::vector<std::future<void>> futures;
 #endif
 
   for (size_t chunkBegin = 0; chunkBegin < tensor.size();
-       chunkBegin += FLAGS_caffe2_tensor_chunk_size) {
+       chunkBegin += chunk_size) {
     auto task = [&](size_t chunkBegin) {
       BlobProto blob_proto;
       blob_proto.set_name(name);
@@ -191,15 +208,11 @@
       TensorProto& proto = *blob_proto.mutable_tensor();
       proto.set_name(name);
       this->Serialize(
-          tensor,
-          name,
-          blob_proto.mutable_tensor(),
-          chunkBegin,
-          FLAGS_caffe2_tensor_chunk_size);
+          tensor, name, blob_proto.mutable_tensor(), chunkBegin, chunk_size);
       acceptor(name, blob_proto.SerializeAsString());
     };
 #ifndef __ANDROID__
-    if (tensor.size() > FLAGS_caffe2_tensor_chunk_size) {
+    if (tensor.size() > chunk_size) {
       futures.emplace_back(std::async(std::launch::async, task, chunkBegin));
     } else {
       // Sync mode for small tensors
@@ -224,11 +237,17 @@
     const Tensor<Context>& input, const string& name,
     TensorProto* proto_ptr, size_t chunkBegin, int32_t chunkSize) {
   CAFFE_ENFORCE(
-    chunkBegin < input.size(),
-    "Chunk begin is out of tensor: ",
-    chunkBegin,
-    ' ',
-    input.size());
+      chunkBegin < input.size(),
+      "Chunk begin is out of tensor: ",
+      chunkBegin,
+      ' ',
+      input.size());
+  CAFFE_ENFORCE(
+      input.raw_data(),
+      "The input does not have data input yet. This is probably because you "
+      "created a tensor of non-zero shape but never filled its data via "
+      "mutable_data() calls. This means that it makes no sense to serialize "
+      "the tensor content.");
   if (chunkBegin + chunkSize > input.size()) {
     chunkSize = input.size() - chunkBegin;
   }
diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h
index 1245dd9..c1d2802 100644
--- a/caffe2/core/blob_serializer_base.h
+++ b/caffe2/core/blob_serializer_base.h
@@ -38,6 +38,15 @@
    */
   virtual void Serialize(const Blob& blob, const std::string& name,
                         SerializationAcceptor acceptor) = 0;
+
+  virtual void SerializeWithChunkSize(
+      const Blob& blob,
+      const std::string& name,
+      SerializationAcceptor acceptor,
+      int chunk_size) {
+    // Base implementation.
+    Serialize(blob, name, acceptor);
+  }
 };
 
 } // namespace caffe2
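Note: Blob::Serialize now takes an optional chunk_size (defaulting to -1, i.e. FLAGS_caffe2_tensor_chunk_size), and serializers that understand chunking override the new SerializeWithChunkSize; the base implementation above simply ignores the chunk size. A minimal usage sketch, assuming a CPU tensor blob as in the test added further below (names and sizes are illustrative):

    caffe2::Blob blob;
    auto* tensor = blob.GetMutable<caffe2::TensorCPU>();
    tensor->Resize(4, 1000);
    tensor->mutable_data<float>();  // data must exist before serialization

    // The acceptor is invoked once per serialized chunk.
    blob.Serialize(
        "my_tensor",
        [](const std::string& name, const std::string& serialized) {
          // e.g. write each chunk to a DB keyed by `name`
        },
        1000 /* chunk_size; -1 falls back to FLAGS_caffe2_tensor_chunk_size */);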
diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc
index 8ec62da..18fae4f 100644
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@@ -25,6 +25,12 @@
 
 class BlobTestFoo {};
 class BlobTestBar {};
+}
+
+CAFFE_KNOWN_TYPE(BlobTestFoo);
+CAFFE_KNOWN_TYPE(BlobTestBar);
+
+namespace {
 
 TEST(BlobTest, Blob) {
   Blob blob;
@@ -260,10 +266,10 @@
   }
 }
 
-TYPED_TEST(TensorCPUDeathTest, CannotShareDataWhenShapeNotSet) {
+TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
   std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
   TensorCPU tensor;
-  EXPECT_DEATH(tensor.ShareExternalPointer(raw_buffer.get()), "");
+  ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
 }
 
 TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
@@ -581,5 +587,29 @@
   }
 }
 
+TEST(CustomChunkSize, BigTensorSerialization) {
+  int64_t d1 = 2;
+  int64_t d2 = FLAGS_caffe2_test_big_tensor_size
+      ? FLAGS_caffe2_test_big_tensor_size / d1
+      : static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
+  int64_t size = d1 * d2;
+
+  Blob blob;
+  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  tensor->Resize(d1, d2);
+  tensor->mutable_data<float>();
+  std::mutex mutex;
+  int counter = 0;
+  auto acceptor = [&](const std::string& key, const std::string& value) {
+    std::lock_guard<std::mutex> guard(mutex);
+    counter++;
+  };
+  blob.Serialize("test", acceptor, size);
+  EXPECT_EQ(counter, 1);
+
+  counter = 0;
+  blob.Serialize("test", acceptor, (size / 2) + 1);
+  EXPECT_EQ(counter, 2);
+}
 } // namespace
 } // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 5e46c8d..90e28f8 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -8,6 +8,7 @@
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/init.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/tensor.h"
 #include "caffe2/utils/string_utils.h"
 
 
@@ -43,6 +44,8 @@
 
 namespace caffe2 {
 
+CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);
+
 thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
 
 // Static global variables for setting up the memory pool.
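Note: several translation units now register their types explicitly with CAFFE_KNOWN_TYPE at caffe2 namespace scope (outside any anonymous namespace), as blob_test.cc does above and db.cc does below. This gives TypeMeta a stable id and name for the type; the serializer lookup in Blob::Serialize is keyed on that id (CreateSerializer(meta_.id()) above). A minimal sketch with a hypothetical payload type:

    namespace caffe2 {
    struct MyPayload {
      int value;
    };
    // Must live at caffe2 namespace scope, not inside an anonymous namespace.
    CAFFE_KNOWN_TYPE(MyPayload);
    }  // namespace caffe2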
diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc
index 8ef5a51..9e59951 100644
--- a/caffe2/core/db.cc
+++ b/caffe2/core/db.cc
@@ -6,6 +6,10 @@
 #include "caffe2/core/logging.h"
 
 namespace caffe2 {
+
+CAFFE_KNOWN_TYPE(db::DBReader);
+CAFFE_KNOWN_TYPE(db::Cursor);
+
 namespace db {
 
 CAFFE_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
@@ -30,7 +34,7 @@
 
   void SeekToFirst() override {
     fseek(file_, 0, SEEK_SET);
-    CHECK(!feof(file_)) << "Hmm, empty file?";
+    CAFFE_ENFORCE(!feof(file_), "Hmm, empty file?");
     // Read the first item.
     valid_ = true;
     Next();
@@ -64,12 +68,12 @@
   }
 
   string key() override {
-    CHECK(valid_) << "Cursor is at invalid location!";
+    CAFFE_ENFORCE(valid_, "Cursor is at invalid location!");
     return string(key_.data(), key_len_);
   }
 
   string value() override {
-    CHECK(valid_) << "Cursor is at invalid location!";
+    CAFFE_ENFORCE(valid_, "Cursor is at invalid location!");
     return string(value_.data(), value_len_);
   }
 
@@ -133,7 +137,7 @@
         file_ = fopen(source.c_str(), "rb");
         break;
     }
-    CHECK(file_) << "Cannot open file: " << source;
+    CAFFE_ENFORCE(file_, "Cannot open file: " + source);
     VLOG(1) << "Opened MiniDB " << source;
   }
   ~MiniDB() { Close(); }
@@ -151,7 +155,7 @@
   }
 
   unique_ptr<Transaction> NewTransaction() override {
-    CHECK(this->mode_ == NEW || this->mode_ == WRITE);
+    CAFFE_ENFORCE(this->mode_ == NEW || this->mode_ == WRITE);
     return make_unique<MiniDBTransaction>(file_, &file_access_mutex_);
   }
 
@@ -169,7 +173,7 @@
     const Blob& blob,
     const string& name,
     BlobSerializerBase::SerializationAcceptor acceptor) {
-  CHECK(blob.IsType<DBReader>());
+  CAFFE_ENFORCE(blob.IsType<DBReader>());
   auto& reader = blob.Get<DBReader>();
   DBReaderProto proto;
   proto.set_name(name);
diff --git a/caffe2/core/db.h b/caffe2/core/db.h
index 530f2c3..9d92c89 100644
--- a/caffe2/core/db.h
+++ b/caffe2/core/db.h
@@ -130,8 +130,12 @@
   friend class DBReaderSerializer;
   DBReader() {}
 
-  DBReader(const string& db_type, const string& source) {
-    Open(db_type, source);
+  DBReader(
+      const string& db_type,
+      const string& source,
+      const int32_t num_shards = 1,
+      const int32_t shard_id = 0) {
+    Open(db_type, source, num_shards, shard_id);
   }
 
   explicit DBReader(const DBReaderProto& proto) {
@@ -142,6 +146,8 @@
           "does not support it.");
       cursor_->Seek(proto.key());
     }
+    num_shards_ = 1;
+    shard_id_ = 0;
   }
 
   explicit DBReader(std::unique_ptr<DB> db)
@@ -152,7 +158,11 @@
     cursor_ = db_->NewCursor();
   }
 
-  void Open(const string& db_type, const string& source) {
+  void Open(
+      const string& db_type,
+      const string& source,
+      const int32_t num_shards = 1,
+      const int32_t shard_id = 0) {
     // Note(jiayq): resetting is needed when we re-open e.g. leveldb where no
     // concurrent access is allowed.
     cursor_.reset();
@@ -162,9 +172,16 @@
     db_ = CreateDB(db_type_, source_, READ);
     CAFFE_ENFORCE(db_,
         "Cannot open db: ", source_, " of type ", db_type_);
+    CAFFE_ENFORCE(num_shards >= 1);
+    CAFFE_ENFORCE(shard_id >= 0);
+    CAFFE_ENFORCE(shard_id < num_shards);
+    num_shards_ = num_shards;
+    shard_id_ = shard_id;
     cursor_ = db_->NewCursor();
+    SeekToFirst();
   }
 
+ public:
   /**
    * Read a set of key and value from the db and move to next. Thread safe.
    *
@@ -182,13 +199,18 @@
    * output blob.
    */
   void Read(string* key, string* value) const {
-    CHECK(cursor_ != nullptr) << "Reader not initialized.";
+    CAFFE_ENFORCE(cursor_ != nullptr, "Reader not initialized.");
     std::unique_lock<std::mutex> mutex_lock(reader_mutex_);
     *key = cursor_->key();
     *value = cursor_->value();
-    cursor_->Next();
-    if (!cursor_->Valid()) {
-      cursor_->SeekToFirst();
+
+    // In sharded mode, each read skips num_shards_ records
+    for (int s = 0; s < num_shards_; s++) {
+      cursor_->Next();
+      if (!cursor_->Valid()) {
+        MoveToBeginning();
+        break;
+      }
     }
   }
 
@@ -196,9 +218,9 @@
    * @brief Seeks to the first key. Thread safe.
    */
   void SeekToFirst() const {
-    CHECK(cursor_ != nullptr) << "Reader not initialized.";
+    CAFFE_ENFORCE(cursor_ != nullptr, "Reader not initialized.");
     std::unique_lock<std::mutex> mutex_lock(reader_mutex_);
-    cursor_->SeekToFirst();
+    MoveToBeginning();
   }
 
   /**
@@ -215,11 +237,24 @@
   }
 
  private:
+  void MoveToBeginning() const {
+    if (cursor_->SupportsSeek()) {
+      cursor_->SeekToFirst();
+    }
+    for (auto s = 0; s < shard_id_; s++) {
+      cursor_->Next();
+      CAFFE_ENFORCE(
+          cursor_->Valid(), "Db has fewer rows than shard id: ", s, ", ", shard_id_);
+    }
+  }
+
   string db_type_;
   string source_;
   unique_ptr<DB> db_;
   unique_ptr<Cursor> cursor_;
   mutable std::mutex reader_mutex_;
+  uint32_t num_shards_;
+  uint32_t shard_id_;
 
   DISABLE_COPY_AND_ASSIGN(DBReader);
 };
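Note: DBReader now accepts num_shards and shard_id so several readers can partition one database: MoveToBeginning skips shard_id records from the start, and each Read advances the cursor num_shards records, wrapping around at the end. A minimal usage sketch, assuming a minidb file at an illustrative path:

    // Reader for shard 1 of 4: starts at record 1, then reads records 5, 9, ...
    caffe2::db::DBReader reader("minidb", "/tmp/example.minidb", 4, 1);

    std::string key, value;
    reader.Read(&key, &value);  // thread safe; advances by num_shards records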
diff --git a/caffe2/core/init.cc b/caffe2/core/init.cc
index a73f0c1..bac6766 100644
--- a/caffe2/core/init.cc
+++ b/caffe2/core/init.cc
@@ -5,12 +5,20 @@
 #endif
 namespace caffe2 {
 
+namespace internal {
+Caffe2InitializeRegistry* Caffe2InitializeRegistry::Registry() {
+  static Caffe2InitializeRegistry gRegistry;
+  return &gRegistry;
+}
+}
+
 bool GlobalInit(int* pargc, char*** pargv) {
   static bool global_init_was_already_run = false;
   if (global_init_was_already_run) {
     VLOG(1) << "GlobalInit has already been called: did you double-call?";
     return true;
   }
+  global_init_was_already_run = true;
   bool success = true;
   success &= internal::Caffe2InitializeRegistry::Registry()
       ->RunRegisteredEarlyInitFunctions(pargc, pargv);
@@ -23,9 +31,11 @@
   // All other initialization functions.
   success &= internal::Caffe2InitializeRegistry::Registry()
       ->RunRegisteredInitFunctions(pargc, pargv);
+  if (!success) {
+    global_init_was_already_run = false;
+  }
   CAFFE_ENFORCE(success,
                 "Failed to run some init functions for caffe2.");
-  global_init_was_already_run = true;
   // TODO: if we fail GlobalInit(), should we continue?
   return success;
 }
diff --git a/caffe2/core/init.h b/caffe2/core/init.h
index ca7f979..b20866e 100644
--- a/caffe2/core/init.h
+++ b/caffe2/core/init.h
@@ -11,10 +11,9 @@
 class Caffe2InitializeRegistry {
  public:
   typedef bool (*InitFunction)(int*, char***);
-  static Caffe2InitializeRegistry* Registry() {
-    static Caffe2InitializeRegistry gRegistry;
-    return &gRegistry;
-  }
+  // Registry() is defined in .cpp file to make registration work across
+  // multiple shared libraries loaded with RTLD_LOCAL
+  static Caffe2InitializeRegistry* Registry();
 
   void Register(InitFunction function, bool run_early,
                 const char* description) {
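Note: Registry() moves out of the header so that every shared library resolves to the same registry instance even when libraries are loaded with RTLD_LOCAL (a header-local static would otherwise be duplicated per library). Registration itself is unchanged; a minimal sketch with a hypothetical init function, following the REGISTER_CAFFE2_INIT_FUNCTION call visible in init_omp.cc below (the three-argument form — name, function pointer, description — is an assumption, since the hunks cut those calls off):

    // Init functions use the InitFunction signature declared above and are run
    // from GlobalInit().
    bool SetUpMyFeature(int* /*pargc*/, char*** /*pargv*/) {
      // one-time process setup goes here
      return true;
    }
    REGISTER_CAFFE2_INIT_FUNCTION(
        SetUpMyFeature,
        &SetUpMyFeature,
        "Example init function; name and description are illustrative.");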
diff --git a/caffe2/core/init_omp.cc b/caffe2/core/init_omp.cc
index 0f27c03..1b3be53 100644
--- a/caffe2/core/init_omp.cc
+++ b/caffe2/core/init_omp.cc
@@ -28,16 +28,15 @@
   if (!getenv("OMP_NUM_THREADS")) {
     // OMP_NUM_THREADS not passed explicitly, so *disable* OMP by
     // default. The user can use the CLI flag to override.
-    LOG(INFO) << "OMP_NUM_THREADS not passed, defaulting to 1 thread";
+    VLOG(1) << "OMP_NUM_THREADS not passed, defaulting to 1 thread";
     omp_set_num_threads(1);
   }
 
   if (FLAGS_caffe2_omp_num_threads > 0) {
-    LOG(INFO) << "Setting omp_num_threads to " << FLAGS_caffe2_omp_num_threads;
+    VLOG(1) << "Setting omp_num_threads to " << FLAGS_caffe2_omp_num_threads;
     omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
   }
-  LOG(INFO) << "Caffe2 running with " << omp_get_max_threads()
-            << " OMP threads";
+  VLOG(1) << "Caffe2 running with " << omp_get_max_threads() << " OMP threads";
   return true;
 }
 REGISTER_CAFFE2_INIT_FUNCTION(Caffe2SetOpenMPThreads,
@@ -48,24 +47,23 @@
 #ifdef CAFFE2_USE_MKL
 bool Caffe2SetMKLThreads(int*, char***) {
   if (!getenv("MKL_NUM_THREADS")) {
-    LOG(INFO) << "MKL_NUM_THREADS not passed, defaulting to 1 thread";
+    VLOG(1) << "MKL_NUM_THREADS not passed, defaulting to 1 thread";
     mkl_set_num_threads(1);
   }
 
   // If caffe2_omp_num_threads is set, we use that for MKL as well.
   if (FLAGS_caffe2_omp_num_threads > 0) {
-    LOG(INFO) << "Setting mkl_num_threads to " << FLAGS_caffe2_omp_num_threads
-              << " as inherited from omp_num_threads.";
+    VLOG(1) << "Setting mkl_num_threads to " << FLAGS_caffe2_omp_num_threads
+            << " as inherited from omp_num_threads.";
     mkl_set_num_threads(FLAGS_caffe2_omp_num_threads);
   }
 
   // Override omp_num_threads if mkl_num_threads is set.
   if (FLAGS_caffe2_mkl_num_threads > 0) {
-    LOG(INFO) << "Setting mkl_num_threads to " << FLAGS_caffe2_mkl_num_threads;
+    VLOG(1) << "Setting mkl_num_threads to " << FLAGS_caffe2_mkl_num_threads;
     mkl_set_num_threads(FLAGS_caffe2_mkl_num_threads);
   }
-  LOG(INFO) << "Caffe2 running with " << mkl_get_max_threads()
-            << " MKL threads";
+  VLOG(1) << "Caffe2 running with " << mkl_get_max_threads() << " MKL threads";
   return true;
 }
 REGISTER_CAFFE2_INIT_FUNCTION(
diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc
index 84a6028..522d8b5 100644
--- a/caffe2/core/logging.cc
+++ b/caffe2/core/logging.cc
@@ -23,8 +23,8 @@
 }
 
 size_t ReplaceAll(string& s, const char* from, const char* to) {
-  CHECK(from && *from);
-  CHECK(to);
+  CAFFE_ENFORCE(from && *from);
+  CAFFE_ENFORCE(to);
 
   size_t numReplaced = 0;
   string::size_type lenFrom = std::strlen(from);
@@ -37,12 +37,19 @@
   return numReplaced;
 }
 
+static std::function<string(void)> FetchStackTrace = []() { return ""; };
+
+void SetStackTraceFetcher(std::function<string(void)> fetcher) {
+  FetchStackTrace = fetcher;
+}
+
 EnforceNotMet::EnforceNotMet(
     const char* file,
     const int line,
     const char* condition,
     const string& msg)
     : msg_stack_{MakeString(
+          FetchStackTrace(),
           "[enforce fail at ",
           StripBasename(std::string(file)),
           ":",
@@ -50,22 +57,27 @@
           "] ",
           condition,
           ". ",
-          msg)} {
+          msg,
+          " ")} {
   if (FLAGS_caffe2_use_fatal_for_enforce) {
     LOG(FATAL) << msg_stack_[0];
-  } else {
-    LOG(ERROR) << msg_stack_[0];
   }
+  full_msg_ = this->msg();
 }
 
 void EnforceNotMet::AppendMessage(const string& msg) {
-  LOG(ERROR) << msg;
   msg_stack_.push_back(msg);
+  full_msg_ = this->msg();
 }
 
 string EnforceNotMet::msg() const {
   return std::accumulate(msg_stack_.begin(), msg_stack_.end(), string(""));
 }
+
+const char* EnforceNotMet::what() const noexcept {
+  return full_msg_.c_str();
+}
+
 }  // namespace caffe2
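Note: SetStackTraceFetcher installs a process-wide callback whose result is prepended to every EnforceNotMet message, and the exception now caches the accumulated text so the new what() override can return it. A minimal sketch; the fetcher body is a placeholder (a real one would symbolize the current call stack):

    caffe2::SetStackTraceFetcher([]() -> std::string {
      return "(stack trace unavailable in this build)\n";
    });
    // Any subsequent enforce failure now carries the fetcher's output, e.g.
    // "(stack trace unavailable in this build)\n[enforce fail at foo.cc:42] ..."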
 
 
diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h
index d617e9a..d1b0ee8 100644
--- a/caffe2/core/logging.h
+++ b/caffe2/core/logging.h
@@ -3,6 +3,8 @@
 
 #include <climits>
 #include <exception>
+#include <functional>
+#include <limits>
 #include <sstream>
 
 #include "caffe2/core/flags.h"
@@ -75,6 +77,8 @@
 // Returns number of replacements
 size_t ReplaceAll(string& s, const char* from, const char* to);
 
+void SetStackTraceFetcher(std::function<string(void)> fetcher);
+
 class EnforceNotMet : public std::exception {
  public:
   EnforceNotMet(
@@ -88,8 +92,11 @@
     return msg_stack_;
   }
 
+  const char* what() const noexcept override;
+
  private:
   vector<string> msg_stack_;
+  string full_msg_;
 };
 
 #define CAFFE_ENFORCE(condition, ...)                                         \
@@ -104,6 +111,124 @@
   throw ::caffe2::EnforceNotMet( \
       __FILE__, __LINE__, "", ::caffe2::MakeString(__VA_ARGS__))
 
+/**
+ * Rich logging messages
+ *
+ * CAFFE_ENFORCE_THAT can be used with one of the "checker functions" that
+ * capture input argument values and add it to the exception message. E.g.
+ * `CAFFE_ENFORCE_THAT(Equals(foo(x), bar(y)), "Optional additional message")`
+ * would evaluate both foo and bar only once and if the results are not equal -
+ * include them in the exception message.
+ *
+ * Some of the basic checker functions like Equals or Greater are already
+ * defined below. Other header might define customized checkers by adding
+ * functions to caffe2::enforce_detail namespace. For example:
+ *
+ *   namespace caffe2 { namespace enforce_detail {
+ *   inline EnforceFailMessage IsVector(const vector<TIndex>& shape) {
+ *     if (shape.size() == 1) { return EnforceOK(); }
+ *     return MakeString("Shape ", shape, " is not a vector");
+ *   }
+ *   }}
+ *
+ * With further usages like `CAFFE_ENFORCE_THAT(IsVector(Input(0).dims()))`
+ *
+ * Convenient wrappers for binary operations like CAFFE_ENFORCE_EQ are provided
+ * too. Please use them instead of CHECK_EQ and friends for failures in
+ * user-provided input.
+ */
+
+namespace enforce_detail {
+
+struct EnforceOK {};
+
+class EnforceFailMessage {
+ public:
+  constexpr /* implicit */ EnforceFailMessage(EnforceOK) : msg_(nullptr) {}
+
+  EnforceFailMessage(EnforceFailMessage&&) = default;
+  EnforceFailMessage(const EnforceFailMessage&) = delete;
+  EnforceFailMessage& operator=(EnforceFailMessage&&) = delete;
+  EnforceFailMessage& operator=(const EnforceFailMessage&) = delete;
+
+  // Catch all wrong usages like CAFFE_ENFORCE_THAT(x < y)
+  template <class... Args>
+  /* implicit */ EnforceFailMessage(Args...) {
+    static_assert(
+        // This stands for an "impossible" condition. Plain `false` doesn't
+        // trick compiler enough.
+        sizeof...(Args) == std::numeric_limits<std::size_t>::max(),
+        "CAFFE_ENFORCE_THAT has to be used with one of special check functions "
+        "like `Equals`. Use CAFFE_ENFORCE for simple boolean checks.");
+  }
+
+  /* implicit */ EnforceFailMessage(std::string&& msg) {
+    msg_ = new std::string(std::move(msg));
+  }
+  inline bool bad() const {
+    return msg_;
+  }
+  std::string get_message_and_free(std::string&& extra) const {
+    std::string r;
+    if (extra.empty()) {
+      r = std::move(*msg_);
+    } else {
+      r = ::caffe2::MakeString(std::move(*msg_), ". ", std::move(extra));
+    }
+    delete msg_;
+    return r;
+  }
+
+ private:
+  std::string* msg_;
+};
+
+#define BINARY_COMP_HELPER(name, op)                         \
+  template <typename T1, typename T2>                        \
+  inline EnforceFailMessage name(const T1& x, const T2& y) { \
+    if (x op y) {                                            \
+      return EnforceOK();                                    \
+    }                                                        \
+    return MakeString(x, " vs ", y);                         \
+  }
+BINARY_COMP_HELPER(Equals, ==)
+BINARY_COMP_HELPER(NotEquals, !=)
+BINARY_COMP_HELPER(Greater, >)
+BINARY_COMP_HELPER(GreaterEquals, >=)
+BINARY_COMP_HELPER(Less, <)
+BINARY_COMP_HELPER(LessEquals, <=)
+#undef BINARY_COMP_HELPER
+
+#define CAFFE_ENFORCE_THAT_IMPL(condition, expr, ...)                 \
+  do {                                                                \
+    using namespace ::caffe2::enforce_detail;                         \
+    const EnforceFailMessage& r = (condition);                        \
+    if (r.bad()) {                                                    \
+      throw ::caffe2::EnforceNotMet(                                  \
+          __FILE__,                                                   \
+          __LINE__,                                                   \
+          expr,                                                       \
+          r.get_message_and_free(::caffe2::MakeString(__VA_ARGS__))); \
+    }                                                                 \
+  } while (false)
+}
+
+#define CAFFE_ENFORCE_THAT(condition, ...) \
+  CAFFE_ENFORCE_THAT_IMPL((condition), #condition, __VA_ARGS__)
+
+#define CAFFE_ENFORCE_EQ(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(Equals((x), (y)), #x " == " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_NE(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(NotEquals((x), (y)), #x " != " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_LE(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(LessEquals((x), (y)), #x " <= " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_LT(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(Less((x), (y)), #x " < " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_GE(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(GreaterEquals((x), (y)), #x " >= " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_GT(x, y, ...) \
+  CAFFE_ENFORCE_THAT_IMPL(Greater((x), (y)), #x " > " #y, __VA_ARGS__)
+
 } // namespace caffe2
 
 #endif // CAFFE2_CORE_LOGGING_H_
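Note: because EnforceNotMet overrides what() and AppendMessage keeps the cached full message in sync, callers can enrich an in-flight enforce failure and still get the complete text through a plain std::exception handler. A minimal sketch (assuming AppendMessage is a public member, as its out-of-line definition in logging.cc suggests; the values are illustrative):

    int batch_size = 0;  // illustrative value that fails the check
    try {
      CAFFE_ENFORCE_GT(batch_size, 0, "batch_size must be positive");
    } catch (caffe2::EnforceNotMet& e) {
      e.AppendMessage(" (while preparing a hypothetical op)");
      LOG(ERROR) << e.what();  // full message stack, including the appended part
      throw;
    }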
diff --git a/caffe2/core/logging_test.cc b/caffe2/core/logging_test.cc
index a8d494e..cce709e 100644
--- a/caffe2/core/logging_test.cc
+++ b/caffe2/core/logging_test.cc
@@ -17,16 +17,58 @@
     CAFFE_ENFORCE(false, "This throws.");
     // This should never be triggered.
     EXPECT_FALSE(true);
-  } catch (const EnforceNotMet& err) {}
+  } catch (const EnforceNotMet& err) {
+  }
   std::swap(FLAGS_caffe2_use_fatal_for_enforce, kFalse);
 }
 
+TEST(LoggingTest, TestEnforceEquals) {
+  int x = 4;
+  int y = 5;
+  try {
+    CAFFE_ENFORCE_THAT(Equals(++x, ++y));
+    // This should never be triggered.
+    EXPECT_FALSE(true);
+  } catch (const EnforceNotMet& err) {
+    EXPECT_NE(err.msg().find("5 vs 6"), string::npos);
+  }
+
+  // arguments are evaluated only once
+  CAFFE_ENFORCE_THAT(Equals(++x, y));
+  EXPECT_EQ(x, 6);
+  EXPECT_EQ(y, 6);
+}
+
+TEST(LoggingTest, EnforceShowcase) {
+  // It's not really a test but rather a convenient thing that you can run and
+  // see all messages
+  int one = 1;
+  int two = 2;
+  int three = 3;
+#define WRAP_AND_PRINT(exp)                     \
+  try {                                         \
+    exp;                                        \
+  } catch (const EnforceNotMet& err) {          \
+    /* EnforceNotMet already does LOG(ERROR) */ \
+  }
+  WRAP_AND_PRINT(CAFFE_ENFORCE_EQ(one, two));
+  WRAP_AND_PRINT(CAFFE_ENFORCE_NE(one * 2, two));
+  WRAP_AND_PRINT(CAFFE_ENFORCE_GT(one, two));
+  WRAP_AND_PRINT(CAFFE_ENFORCE_GE(one, two));
+  WRAP_AND_PRINT(CAFFE_ENFORCE_LT(three, two));
+  WRAP_AND_PRINT(CAFFE_ENFORCE_LE(three, two));
+
+  WRAP_AND_PRINT(CAFFE_ENFORCE_EQ(
+      one * two + three, three * two, "It's a pretty complicated expression"));
+
+  WRAP_AND_PRINT(CAFFE_ENFORCE_THAT(Equals(one * two + three, three * two)));
+}
+
 TEST(LoggingDeathTest, TestEnforceUsingFatal) {
   bool kTrue = true;
   std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
-  EXPECT_DEATH(
-      CAFFE_ENFORCE(false, "This goes fatal."), "");
+  EXPECT_DEATH(CAFFE_ENFORCE(false, "This goes fatal."), "");
   std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
 }
 
-}  // namespace caffe2
+} // namespace caffe2
diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc
index 52534e7..fabfc45 100644
--- a/caffe2/core/net.cc
+++ b/caffe2/core/net.cc
@@ -1,6 +1,8 @@
 #include "caffe2/core/net.h"
 
 #include <set>
+#include <stack>
+#include <unordered_map>
 #include <unordered_set>
 
 #include "caffe2/core/operator.h"
@@ -9,7 +11,7 @@
 
 CAFFE2_DEFINE_bool(
     caffe2_disable_chaining,
-    true,
+    false,
     "Disable chaining logic (some latent multi-device issues).");
 
 namespace caffe2 {
@@ -23,25 +25,6 @@
 }
 
 using OpIndex = int;
-using Ancestry = std::vector<std::unordered_set<OpIndex>>;
-Ancestry computeAncestors(
-    const std::vector<internal::OperatorNode>& ops) {
-  Ancestry ancestors;
-  ancestors.resize(ops.size());
-  for (auto i = 0; i < ops.size(); ++i) {
-    const auto& parents = ops[i].parents_;
-    for (const auto parent : parents) {
-      ancestors[i].insert(parent);
-      for (const auto parent_ancestor : ancestors[parent]) {
-        ancestors[i].insert(parent_ancestor);
-      }
-    }
-    VLOG(2) << "Ancestors of op: " << i << ", "
-            << std::vector<OpIndex>(ancestors[i].begin(), ancestors[i].end());
-  }
-  return ancestors;
-}
-
 DAGNetBase::ExecutionChains singleChains(
     const std::vector<internal::OperatorNode>& nodes) {
   DAGNetBase::ExecutionChains chains;
@@ -53,74 +36,136 @@
 
 DAGNetBase::ExecutionChains computeChains(
     const std::vector<internal::OperatorNode>& nodes) {
-  const auto& ancestry = computeAncestors(nodes);
+  vector<int> initial_frontier;
+  for (int idx = 0; idx < nodes.size(); ++idx) {
+    if (nodes[idx].parents_.size() == 0) {
+      initial_frontier.push_back(idx);
+    }
+  }
 
+  // We need to construct the node_seen_count to know how many inner edges each
+  // node has.
+  std::unordered_map<OpIndex, int> node_seen_count;
+
+  for (int root_index : initial_frontier) {
+    const auto& root = nodes[root_index];
+    std::stack<std::pair<OpIndex, std::vector<int>::const_iterator>>
+        depth_stack;
+    depth_stack.push(make_pair(root_index, root.children_.begin()));
+    node_seen_count[root_index]++;
+    CAFFE_ENFORCE(
+        node_seen_count[root_index] == 1,
+        "root node ",
+        root_index,
+        " visit count must be == 1");
+
+    while (depth_stack.size() > 0) {
+      auto cur = depth_stack.top();
+      depth_stack.pop();
+      if (cur.second != nodes[cur.first].children_.end()) {
+        OpIndex node_index = *cur.second;
+        node_seen_count[node_index]++;
+        cur.second++;
+        depth_stack.push(cur);
+        if (node_seen_count[node_index] == 1) {
+          // Visit each child only once.
+          depth_stack.push(
+              make_pair(node_index, nodes[node_index].children_.begin()));
+        }
+      }
+    }
+  }
   // Now, we compute the set of execution chains An execution chain is
   // a linear set of nodes that can be executed on a single stream
   // (e.g. a chain of single input, single output operators)
   DAGNetBase::ExecutionChains chains;
   std::unordered_set<OpIndex> seen_nodes;
-  for (auto i = 0; i < nodes.size(); ++i) {
-    if (seen_nodes.find(i) != seen_nodes.end()) {
-      // We've already executed this operator.
-      continue;
-    }
-    // Compute the execution chain rooted at this node.
-    std::vector<OpIndex> chain;
-    chain.push_back(i);
-
-    while (true) {
-      const auto current = chain.back();
-      const auto& children = nodes[current].children_;
-
-      // Find children for which this current node is the *single*
-      // direct ancestor. If there are more than one, then we can't
-      // chain.
-      std::vector<OpIndex> candidates;
-      for (const auto child : children) {
-        std::vector<OpIndex> direct_parents;
-        const auto& parents = nodes[child].parents_;
-        for (const auto parent : parents) {
-          if (std::all_of(
-                  parents.begin(), parents.end(), [&](OpIndex other_parent) {
-                    // If `other_parent` contains `parent` in it's
-                    // ancestors, we can ignore `parent`.
-                    return !ancestry.at(other_parent).count(parent);
-                  })) {
-            direct_parents.push_back(parent);
-          }
-        }
-        if (direct_parents.size() == 1 && direct_parents.front() == current) {
-          candidates.push_back(child);
-        }
-      }
-
-      if (candidates.size() != 1) {
-        break;
-      }
-
-      const auto candidate = candidates.front();
-      const auto parent = chain.back();
-
-      if (!sameDevice(
-              nodes[candidate].operator_->def(),
-              nodes[parent].operator_->def())) {
-        break;
-      }
-
-      chain.push_back(candidate);
-    };
-
-    for (const auto node : chain) {
+  std::vector<OpIndex> chain;
+  std::pair<OpIndex, std::vector<int>::const_iterator> cur;
+  std::stack<std::pair<OpIndex, std::vector<int>::const_iterator>> depth_stack;
+  auto check_current_for_chaining = [&]() -> bool {
+    return (
+        node_seen_count[cur.first] == 1 &&
+        (chain.size() == 0 || sameDevice(
+                                  nodes[cur.first].operator_->def(),
+                                  nodes[chain.back()].operator_->def())));
+  };
+  auto commit_chain = [&]() {
+    if (chain.size() > 0) {
       CAFFE_ENFORCE(
-          seen_nodes.insert(node).second,
-          "Node ",
-          node,
-          " is already in the net.");
+          chains.insert({chain.front(), chain}).second,
+          "Chain ",
+          chain.front(),
+          " was already added.");
+      VLOG(2) << "Added chain: " << chain.front() << "with elements";
+      for (auto ch : chain) {
+        VLOG(2) << ch << ", ";
+      }
+      chain.clear();
     }
-    CAFFE_ENFORCE(
-        chains.insert({i, chain}).second, "Chain ", i, " was already added.");
-    VLOG(2) << "Added chain: " << chain;
+  };
+  auto depth_traverse = [&]() {
+    while (cur.second != nodes[cur.first].children_.end() &&
+           seen_nodes.find(*cur.second) != seen_nodes.end()) {
+      cur.second++;
+    }
+
+    if (cur.second != nodes[cur.first].children_.end()) {
+      auto next = make_pair(*cur.second, nodes[*cur.second].children_.begin());
+      depth_stack.push(cur);
+      depth_stack.push(next);
+    }
+  };
+  for (int root_index : initial_frontier) {
+    depth_stack.push(
+        make_pair(root_index, nodes[root_index].children_.begin()));
+    while (depth_stack.size() > 0) {
+      cur = depth_stack.top();
+      depth_stack.pop();
+      if (seen_nodes.find(cur.first) == seen_nodes.end()) {
+        seen_nodes.insert(cur.first);
+        // Has one child, can be candidate for chain or can be added to the
+        // previous chain.
+        if (nodes[cur.first].children_.size() == 1) {
+          if (check_current_for_chaining()) {
+            // Add oneself to the current chain.
+            VLOG(1) << "Adding to existing chain" << cur.first;
+            chain.push_back(cur.first);
+            int index = *nodes[cur.first].children_.begin();
+            depth_stack.push(make_pair(index, nodes[index].children_.begin()));
+          } else {
+            // Can't belong to the previous chain, commit previous chain and
+            // start a new one.
+            commit_chain();
+            chain.push_back(cur.first);
+            int index = *nodes[cur.first].children_.begin();
+            depth_stack.push(make_pair(index, nodes[index].children_.begin()));
+          }
+        } else if (
+            nodes[cur.first].children_.size() == 0 &&
+            check_current_for_chaining()) {
+          // Add current node to the current chain and commit.
+          chain.push_back(cur.first);
+          commit_chain();
+        } else {
+          // Node has more than one child.
+          commit_chain();
+          // Add current node as an independent chain since it won't be a part
+          // of a bigger chain.
+          chain.push_back(cur.first);
+          commit_chain();
+          depth_traverse();
+        }
+      } else {
+        // This node has been seen before, we will only traverse its children.
+        // Commit any pending chains and continue traversing.
+        commit_chain();
+        depth_traverse();
+      }
+    } // End while
+
+    // TODO: check whether this final commit_chain() is needed.
+    commit_chain();
   }
   CAFFE_ENFORCE(
       seen_nodes.size() == nodes.size(),
@@ -131,7 +176,6 @@
       ".");
   return chains;
 }
-
 }
 
 CAFFE_DEFINE_REGISTRY(NetRegistry, NetBase, const NetDef&, Workspace*);
@@ -150,12 +194,19 @@
     for (const string& in : op.input()) {
       if (!known_blobs.count(in)) {
         if (external_input_.size()) {
-          CAFFE_ENFORCE(false,
-                        "Source for input ", in, " is unknown.");
+          CAFFE_THROW(
+              "op ",
+              op.type(),
+              ": Source for input ",
+              in,
+              " is unknown for net ",
+              def.name(),
+              ", operator ",
+              ProtoDebugString(op));
         } else {
           // If we are not declaring input and output, we will simply VLOG it
           // for debugging purposes.
-          VLOG(1) << "Source for input " << in << " is unknown.";
+          VLOG(1) << "op " << op.type() << ": input " << in << " is unknown.";
         }
       }
     }
@@ -168,7 +219,10 @@
   CAFFE_ENFORCE(
       remaining_output.size() == 0,
       "Some of the blobs are declared as output but never produced by the "
-      "net.");
+      "net ",
+      def.name(),
+      ", the first one is ",
+      *remaining_output.begin());
 }
 
 unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws) {
@@ -182,6 +236,7 @@
 
 SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
     : NetBase(net_def, ws) {
+  VLOG(1) << "Constructing SimpleNet " << net_def.name();
   bool net_def_has_device_option = net_def.has_device_option();
   // Initialize the operators
   for (const OperatorDef& operator_def : net_def.op()) {
@@ -317,6 +372,7 @@
 DAGNetBase::DAGNetBase(const NetDef& net_def, Workspace* ws)
     : NetBase(net_def, ws), operator_nodes_(net_def.op_size()) {
   // Blob creator allows us to track which operator created which blob.
+  VLOG(1) << "Constructing DAGNet " << net_def.name();
   std::map<string, int> blob_creator;
   std::map<string, std::set<int> > blob_readers;
   bool net_def_has_device_option = net_def.has_device_option();
@@ -411,6 +467,9 @@
       (FLAGS_caffe2_disable_chaining ? singleChains(operator_nodes_)
                                      : computeChains(operator_nodes_));
 
+  LOG(INFO) << "Number of parallel execution chains "
+            << execution_chains_.size()
+            << ", number of operators = " << net_def.op_size();
   // TODO: do we want to make sure that there are no loops in the
   // dependency graph?
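
For intuition about what the chain count reported above means, here is a small sketch in the notation used by the chaining tests later in this diff (the linear example is illustrative, not a quote from the tests):

    // Linear net: op0 -> op1 -> op2           => execution chains {0: {0, 1, 2}}
    // Fork:       op0 -> op1 and op0 -> op2   => execution chains {0: {0}, 1: {1}, 2: {2}}
    // Fewer chains means more ops are grouped for sequential execution within a
    // chain, while separate chains are the units that may run in parallel.
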
 
diff --git a/caffe2/core/net_gpu.cc b/caffe2/core/net_gpu.cc
index bd41dbb..fa3c6e3 100644
--- a/caffe2/core/net_gpu.cc
+++ b/caffe2/core/net_gpu.cc
@@ -163,6 +163,7 @@
 class AsyncDAGNet : public DAGNetBase {
  public:
   AsyncDAGNet(const NetDef& net_def, Workspace* ws) : DAGNetBase(net_def, ws) {
+    VLOG(1) << "Constructing Async DAG Net " << net_def.name();
     eventRecorded_.resize(net_def.op_size());
     events_.reserve(net_def.op_size());
     for (int idx = 0; idx < net_def.op_size(); ++idx) {
diff --git a/caffe2/core/net_test.cc b/caffe2/core/net_test.cc
index ae6f3a4..f9e7854 100644
--- a/caffe2/core/net_test.cc
+++ b/caffe2/core/net_test.cc
@@ -44,8 +44,8 @@
     const vector<string>& input,
     const vector<string>& output) {
   NetDef net_def;
-  CHECK(google::protobuf::TextFormat::ParseFromString(
-    kExampleNetDefString, &net_def));
+  CAFFE_ENFORCE(google::protobuf::TextFormat::ParseFromString(
+      kExampleNetDefString, &net_def));
   for (const auto& name : input) {
     net_def.add_external_input(name);
   }
@@ -105,7 +105,7 @@
   Workspace ws;
   ws.CreateBlob("in");
   NetDef net_def;
-  CHECK(google::protobuf::TextFormat::ParseFromString(spec, &net_def));
+  CAFFE_ENFORCE(google::protobuf::TextFormat::ParseFromString(spec, &net_def));
   {
     auto old = FLAGS_caffe2_disable_chaining;
     auto g = MakeGuard([&]() { FLAGS_caffe2_disable_chaining = old; });
@@ -201,34 +201,34 @@
   checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2}}});
 }
 
-TEST(NetTest, ChainingForJoinWithAncestor) {
-  const auto spec = R"DOC(
-        name: "example"
-        type: "dag"
-        external_input: "in"
-        op {
-          input: "in"
-          output: "hidden"
-          type: "NetTestDummy"
-        }
-        op {
-          input: "hidden"
-          output: "out1"
-          type: "NetTestDummy"
-        }
-        op {
-          input: "hidden"
-          output: "out2"
-          type: "NetTestDummy"
-        }
-        op {
-          input: "hidden"
-          input: "out2"
-          type: "NetTestDummy"
-        }
-)DOC";
-  checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2, 3}}});
-}
+// TEST(NetTest, ChainingForJoinWithAncestor) {
+//   const auto spec = R"DOC(
+//         name: "example"
+//         type: "dag"
+//         external_input: "in"
+//         op {
+//           input: "in"
+//           output: "hidden"
+//           type: "NetTestDummy"
+//         }
+//         op {
+//           input: "hidden"
+//           output: "out1"
+//           type: "NetTestDummy"
+//         }
+//         op {
+//           input: "hidden"
+//           output: "out2"
+//           type: "NetTestDummy"
+//         }
+//         op {
+//           input: "hidden"
+//           input: "out2"
+//           type: "NetTestDummy"
+//         }
+// )DOC";
+//   checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2, 3}}});
+// }
 
 TEST(NetTest, ChainingForForkJoin) {
   const auto spec = R"DOC(
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index 79a2a97..387d87a 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -8,6 +8,7 @@
 #include "caffe2/core/workspace.h"
 #include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/utils/proto_utils.h"
+#include "caffe2/utils/string_utils.h"
 
 namespace caffe2 {
 
@@ -16,8 +17,12 @@
     : operator_def_(operator_def), arg_helper_(operator_def_) {
   for (const string& input_str : operator_def_.input()) {
     auto* blob = ws->GetBlob(input_str);
-    CAFFE_ENFORCE(blob != nullptr,
-                  "Encountered a non-existing input blob: ", input_str);
+    CAFFE_ENFORCE(
+        blob != nullptr,
+        "op ",
+        operator_def_.type(),
+        ": Encountered a non-existing input blob: ",
+        input_str);
     inputs_.push_back(blob);
   }
   for (const string& output_str : operator_def_.output()) {
@@ -28,16 +33,23 @@
 namespace {
 unique_ptr<OperatorBase> TryCreateOperator(
     const string& key, const OperatorDef& operator_def, Workspace* ws) {
-  switch (operator_def.device_option().device_type()) {
-  case CPU:
-    VLOG(1) << "Creating CPU operator " << key;
-    return CPUOperatorRegistry()->Create(key, operator_def, ws);
-  case CUDA:
-    VLOG(1) << "Creating CUDA operator " << key;
-    return CUDAOperatorRegistry()->Create(key, operator_def, ws);
-  default:
-    LOG(FATAL) << "Unknown device type: "
-                << operator_def.device_option().device_type();
+  try {
+    switch (operator_def.device_option().device_type()) {
+      case CPU:
+        VLOG(1) << "Creating CPU operator " << key;
+        return CPUOperatorRegistry()->Create(key, operator_def, ws);
+      case CUDA:
+        VLOG(1) << "Creating CUDA operator " << key;
+        return CUDAOperatorRegistry()->Create(key, operator_def, ws);
+      default:
+        LOG(FATAL) << "Unknown device type: "
+                   << operator_def.device_option().device_type();
+        return nullptr;
+    }
+  } catch (const UnsupportedOperatorFeature& err) {
+    VLOG(1) << "Operator " << operator_def.type()
+            << " with engine does not support the requested feature. Msg: "
+            << err.what() << ". Proto is: " << ProtoDebugString(operator_def);
     return nullptr;
   }
 }
@@ -63,17 +75,21 @@
 
   // Second, if the user has provided an engine, try create that engine
   if (operator_def.engine().size()) {
-    string key = operator_def.type() +  "_ENGINE_" + operator_def.engine();
-    VLOG(1) << "Trying to create operator " << operator_def.type()
-            << " with engine " << operator_def.engine();
-    auto op = TryCreateOperator(key, operator_def, ws);
-    if (op) {
-      return op;
+    vector<string> engine_choices = split(',', operator_def.engine());
+    for (const string& engine : engine_choices) {
+      string key = operator_def.type() + "_ENGINE_" + engine;
+      VLOG(1) << "Trying to create operator " << operator_def.type()
+              << " with engine " << engine;
+      auto op = TryCreateOperator(key, operator_def, ws);
+      if (op) {
+        return op;
+      } else {
+        // This engine is not available; try the next candidate, or fall back
+        // to the default implementation after the loop.
+        VLOG(1) << "Operator with engine " << engine
+                << " is not available. Using default implementation.";
+      }
     }
-    // If the above fails, we will just return the normal case with the default
-    // implementation.
-    VLOG(1) << "Operator with engine " << operator_def.engine()
-            << " is not available. Using default implementation.";
   }
 
   // Lastly, if the engine does not work here, try using the default engine.
@@ -142,10 +158,11 @@
     } else if (grad.IsDense()) {
       VLOG(1) << "\t [dense]" << grad.dense_;
     } else {
-      CHECK(grad.indices_.size() && grad.values_.size())
-          << "For sparse gradient, one should set both indices and values. "
-          << "Currently we have: (" << grad.indices_ << ", " << grad.values_
-          << ").";
+      CAFFE_ENFORCE(
+          grad.indices_.size() && grad.values_.size(),
+          "For sparse gradient, one should set both indices and values. "
+          "Currently we have: (" +
+              grad.indices_ + ", " + grad.values_ + ").");
       VLOG(1) << "\t [sparse] " << grad.indices_ << ", " << grad.values_;
     }
   }
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 4df4e10..0f5fcac 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -3,6 +3,7 @@
 
 #include <climits>
 #include <cstddef>
+#include <exception>
 #include <typeinfo>
 #include <vector>
 
@@ -162,7 +163,7 @@
       }
       return (started && finished);
     } catch (EnforceNotMet& err) {
-      err.AppendMessage("Error from operator " + ProtoDebugString(def()));
+      err.AppendMessage("Error from operator: \n" + ProtoDebugString(def()));
       throw;
     }
   }
@@ -172,7 +173,7 @@
       context_.SwitchToDevice();
       return RunOnDevice();
     } catch (EnforceNotMet& err) {
-      err.AppendMessage("Error from operator " + ProtoDebugString(def()));
+      err.AppendMessage("Error from operator: \n" + ProtoDebugString(def()));
       throw;
     }
   }
@@ -339,6 +340,30 @@
 #define REGISTER_CUDNN_OPERATOR(name, ...) \
   REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__)
 
+// An exception that can be thrown by an operator constructor to signal that
+// it does not support the given setting. This is typically used by specific
+// engines that only implement a subset of the features required by the
+// original operator schema.
+// TODO(jiayq): make the exception message more complete.
+class UnsupportedOperatorFeature : public std::exception {
+ public:
+  UnsupportedOperatorFeature(const string& msg) : msg_(msg) {}
+  const char* what() const noexcept override {
+    return msg_.c_str();
+  }
+
+ private:
+  string msg_;
+};
+
+// A helper macro that should ONLY be used in the operator constructor to check
+// if needed features are met. If not, throws the UnsupportedOperatorFeature
+// exception with the given message.
+#define OPERATOR_NEEDS_FEATURE(condition, message) \
+  if (!(condition)) {                              \
+    throw UnsupportedOperatorFeature(message);     \
+  }
+
 // Creates an operator with the given operator definition.
 unique_ptr<OperatorBase> CreateOperator(
     const OperatorDef& operator_def, Workspace* ws);
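
A rough usage sketch of the opt-out mechanism introduced above (everything here is hypothetical: MyOp, MyEngineOp, MYENGINE, and the use_fast_path argument are illustrative, not part of this diff):

    // Engine-specific implementation that only supports a subset of MyOp's
    // schema. If the feature is missing it throws in its constructor, and
    // CreateOperator silently falls back to the default registration.
    class MyEngineOp final : public Operator<CPUContext> {
     public:
      MyEngineOp(const OperatorDef& def, Workspace* ws)
          : Operator<CPUContext>(def, ws) {
        OPERATOR_NEEDS_FEATURE(
            OperatorBase::GetSingleArgument<int>("use_fast_path", 0) == 1,
            "MYENGINE only supports the fast path.");
      }
      bool RunOnDevice() override {
        return true;
      }
    };
    REGISTER_CPU_OPERATOR_WITH_ENGINE(MyOp, MYENGINE, MyEngineOp);
    // Callers may now request an ordered preference list; engines that are
    // unavailable or that throw UnsupportedOperatorFeature are skipped:
    //   op_def.set_engine("MYENGINE,SOME_OTHER_ENGINE");
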
diff --git a/caffe2/core/operator_schema_test.cc b/caffe2/core/operator_schema_test.cc
index 5da2bfe..7fbc95b 100644
--- a/caffe2/core/operator_schema_test.cc
+++ b/caffe2/core/operator_schema_test.cc
@@ -190,6 +190,10 @@
   // deduces the
   // schema from the "to" argument.
   const OpSchema* schema = OpSchemaRegistry::Schema("Cast");
+  if (!schema) {
+    // Compiled without the Cast op.
+    return;
+  }
   OperatorDef def = CreateOperatorDef(
       "Cast",
       "",
diff --git a/caffe2/core/operator_test.cc b/caffe2/core/operator_test.cc
index 98c2de1..bd875d6 100644
--- a/caffe2/core/operator_test.cc
+++ b/caffe2/core/operator_test.cc
@@ -13,6 +13,34 @@
  public:
   using OperatorBase::OperatorBase;
   bool Run() override { return true; }
+  virtual string type() {
+    return "base";
+  }
+};
+
+class JustTestAndNeverConstructs : public JustTest {
+ public:
+  JustTestAndNeverConstructs(const OperatorDef& def, Workspace* ws)
+      : JustTest(def, ws) {
+    throw UnsupportedOperatorFeature("I just don't construct.");
+  }
+  bool Run() override {
+    return true;
+  }
+  string type() override {
+    return "FOO";
+  }
+};
+
+class JustTestAndDoesConstruct : public JustTest {
+ public:
+  using JustTest::JustTest;
+  bool Run() override {
+    return true;
+  }
+  string type() override {
+    return "BAR";
+  }
 };
 
 class ThrowException : public Operator<CPUContext> {
@@ -28,6 +56,8 @@
 OPERATOR_SCHEMA(ThrowException).NumInputs(0).NumOutputs(0);
 
 REGISTER_CPU_OPERATOR(JustTest, JustTest);
+REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, FOO, JustTestAndNeverConstructs);
+REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, BAR, JustTestAndDoesConstruct);
 REGISTER_CUDA_OPERATOR(JustTest, JustTest);
 REGISTER_CPU_OPERATOR(ThrowException, ThrowException);
 
@@ -65,6 +95,26 @@
   }
 }
 
+TEST(OperatorTest, FallbackIfEngineDoesNotBuild) {
+  OperatorDef op_def;
+  Workspace ws;
+  op_def.set_type("JustTest");
+  op_def.set_engine("FOO");
+  unique_ptr<OperatorBase> op = CreateOperator(op_def, &ws);
+  EXPECT_NE(nullptr, op.get());
+  EXPECT_EQ(static_cast<JustTest*>(op.get())->type(), "base");
+}
+
+TEST(OperatorTest, MultipleEngineChoices) {
+  OperatorDef op_def;
+  Workspace ws;
+  op_def.set_type("JustTest");
+  op_def.set_engine("FOO,BAR");
+  unique_ptr<OperatorBase> op = CreateOperator(op_def, &ws);
+  EXPECT_NE(nullptr, op.get());
+  EXPECT_EQ(static_cast<JustTest*>(op.get())->type(), "BAR");
+}
+
 TEST(OperatorTest, CannotUseUninitializedBlob) {
   Workspace ws;
   OperatorDef op_def;
diff --git a/caffe2/core/parallel_net_test.cc b/caffe2/core/parallel_net_test.cc
index e218063..7332be1 100644
--- a/caffe2/core/parallel_net_test.cc
+++ b/caffe2/core/parallel_net_test.cc
@@ -89,14 +89,14 @@
 // Run a network and get its duration in milliseconds.
 int RunNetAndGetDuration(const string& net_def_str, const string& type) {
   NetDef net_def;
-  CHECK(google::protobuf::TextFormat::ParseFromString(
-    net_def_str, &net_def));
+  CAFFE_ENFORCE(
+      google::protobuf::TextFormat::ParseFromString(net_def_str, &net_def));
   net_def.set_type(type);
   Workspace ws;
   unique_ptr<NetBase> net(CreateNet(net_def, &ws));
-  CHECK(net.get() != nullptr);
+  CAFFE_ENFORCE(net.get() != nullptr);
   auto start_time = std::chrono::system_clock::now();
-  CHECK(net->Run());
+  CAFFE_ENFORCE(net->Run());
   // Inspect the time - it should be around 200 milliseconds, since sleep3 can
   // run in parallel with sleep1 and sleep2.
   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc
index 7d7d2d1..47d9f08 100644
--- a/caffe2/core/predictor.cc
+++ b/caffe2/core/predictor.cc
@@ -31,8 +31,11 @@
 }
 }
 
-Predictor::Predictor(const NetDef& init_net, const NetDef& run_net)
-    : run_net_(run_net) {
+Predictor::Predictor(
+    const NetDef& init_net,
+    const NetDef& run_net,
+    Workspace* parent)
+    : run_net_(run_net), ws_(parent) {
   CAFFE_ENFORCE(ws_.RunNetOnce(init_net));
   CAFFE_ENFORCE(ws_.CreateNet(run_net));
 }
diff --git a/caffe2/core/predictor.h b/caffe2/core/predictor.h
index 687c133..7767ece 100644
--- a/caffe2/core/predictor.h
+++ b/caffe2/core/predictor.h
@@ -10,7 +10,10 @@
   using TensorVector = std::vector<TensorCPU*>;
   // Runs the `init_net` once, then saves the `run_net` to be executed
   // in `::run`
-  Predictor(const NetDef& init_net, const NetDef& run_net);
+  Predictor(
+      const NetDef& init_net,
+      const NetDef& run_net,
+      Workspace* parent = nullptr);
 
   // Executes `run_net` on the inputs.
   // The first `inputs.size()` inputs from run_net::external_inputs
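
A rough usage sketch of the new optional parent workspace (blob names, NetDef loading, and the exact run() call are assumptions for illustration, not taken from this diff):

    // Blobs already present in parent_ws (e.g. shared model parameters) become
    // visible to the predictor's internal workspace instead of being recreated.
    Workspace parent_ws;
    NetDef init_net, run_net;  // assumed to be parsed from serialized NetDefs
    Predictor predictor(init_net, run_net, &parent_ws);
    Predictor::TensorVector inputs, outputs;
    predictor.run(inputs, &outputs);  // run() as described in the header above
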
diff --git a/caffe2/core/registry_test.cc b/caffe2/core/registry_test.cc
index 2f879c0c..6683d95 100644
--- a/caffe2/core/registry_test.cc
+++ b/caffe2/core/registry_test.cc
@@ -6,7 +6,8 @@
 #include "caffe2/core/logging.h"
 
 namespace caffe2 {
-namespace registry_test {
+namespace {
+
 class Foo {
  public:
   explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
@@ -41,8 +42,5 @@
 TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
   EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr);
 }
-
-} // registry_test
+}
 }  // namespace caffe2
-
-
diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc
index 7d0a358..4355a1e 100644
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@@ -5,3 +5,8 @@
     caffe2_keep_on_shrink,
     true,
     "If set, keeps memory when a tensor is shrinking its size.");
+
+namespace caffe2 {
+// declaring it here instead of context.cc because tensor.h includes context.h
+CAFFE_KNOWN_TYPE(Tensor<CPUContext>);
+}
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 6886152..b25336e 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -296,7 +296,7 @@
     // It is possible that the source tensor hasn't called mutable_data() yet,
     // in which case ShareData() doesn't make much sense since we don't really
     // know what to share yet.
-    CHECK(src.data_.get()) << "Source tensor has no content yet.";
+    CAFFE_ENFORCE(src.data_.get(), "Source tensor has no content yet.");
     // Finally, do sharing.
     data_ = src.data_;
     capacity_ = src.capacity_;
@@ -313,8 +313,9 @@
   template <typename T>
   void ShareExternalPointer(T* src, size_t capacity = 0) {
     meta_ = TypeMeta::Make<T>();
-    CHECK(size_ > 0)
-        << "To share data with a raw pointer, you need to set shape first.";
+    CAFFE_ENFORCE(
+        size_ > 0,
+        "To share data with a raw pointer, you need to set shape first.");
     data_.reset(src, [](void*)->void {});
     // Sets capacity. If not specified, we will implicitly assume that
     // the capacity is the current size.
@@ -344,8 +345,9 @@
   inline const T* data() const {
     CAFFE_ENFORCE(
         data_.get() || size_ == 0,
-        "The tensor is uninitialized. You probably need to call ",
-        "Resize() and mutable_data() first.");
+        "The tensor is of non-zero shape, but its data is not allocated yet. "
+        "Caffe2 uses a lazy allocation, so you will need to call "
+        "mutable_data() or raw_mutable_data() to actually allocate memory.");
     CAFFE_ENFORCE(
         IsType<T>(),
         "Tensor type mismatch, caller expects elements to be ",
@@ -467,7 +469,7 @@
 
   // Product of all dims up to
   inline TIndex size_to_dim(int k) const {
-    CHECK(k < dims_.size());
+    CAFFE_ENFORCE(k < dims_.size());
     TIndex r = 1;
     for (int i = 0; i < k; ++i) {
       r *= dims_[i];
@@ -544,11 +546,12 @@
   bool SetDims(const vector<T>& src) {
     auto old_size = size_;
     dims_.resize(src.size());
-    size_ = 1;
+    TIndex new_size = 1;
     for (int i = 0; i < src.size(); ++i) {
-      size_ *= src[i];
+      new_size *= src[i];
       dims_[i] = src[i];
     }
+    size_ = new_size;
     return size_ != old_size;
   }
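
A minimal sketch of the lazy-allocation contract that the new data() message describes (the shape and element type are illustrative):

    TensorCPU t;
    t.Resize(2, 3);                      // sets the shape; nothing is allocated
    float* p = t.mutable_data<float>();  // first call allocates 2 * 3 floats
    const float* q = t.data<float>();    // valid now; calling this before
                                         // mutable_data() would trip the enforce
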
 
diff --git a/caffe2/core/typeid.cc b/caffe2/core/typeid.cc
index 5b6352b..c7163a4 100644
--- a/caffe2/core/typeid.cc
+++ b/caffe2/core/typeid.cc
@@ -35,5 +35,5 @@
 };
 static UninitializedTypeNameRegisterer g_uninitialized_type_name_registerer;
 
-}  // namespace
-}  // namespace caffe2
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h
index c737b41..4d68c7a 100644
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@@ -29,10 +29,10 @@
 
 template <typename T>
 struct TypeNameRegisterer {
-  TypeNameRegisterer() {
+  explicit TypeNameRegisterer(CaffeTypeId id) {
 #ifdef __GXX_RTTI
     string name = Demangle(typeid(T).name());
-    gTypeNames()[reinterpret_cast<CaffeTypeId>(Id())] = name;
+    gTypeNames()[reinterpret_cast<CaffeTypeId>(id)] = name;
     // If we are in RTTI mode, we will also use this opportunity to do sanity
     // check if there are duplicated ids registered for the same type. This
     // usually happens when one does not do RTLD_GLOBAL, which is often the
@@ -42,20 +42,15 @@
     if (gRegisteredTypeNames().count(name)) {
       std::cerr << "Type name " << name
                 << " registered twice. This should "
-                   "not happen. Are you using RTLD_GLOBAL correctly?"
+                   "not happen. Do you have duplicated CAFFE_KNOWN_TYPE?"
                 << std::endl;
       throw std::runtime_error("TypeNameRegisterer error with type " + name);
     }
     gRegisteredTypeNames().insert(name);
-#else  // __GXX_RTTI
-    gTypeNames()[reinterpret_cast<CaffeTypeId>(Id())] =
+#else // __GXX_RTTI
+    gTypeNames()[reinterpret_cast<CaffeTypeId>(id)] =
         "(RTTI disabled, cannot show name)";
-#endif  // __GXX_RTTI
-  }
-
-  static CaffeTypeId Id() {
-    static bool type_id_bit[1];
-    return reinterpret_cast<CaffeTypeId>(type_id_bit);
+#endif // __GXX_RTTI
   }
 };
 
@@ -73,20 +68,24 @@
   /** Create a dummy TypeMeta object. To create a TypeMeta object for a specific
    * type, use TypeMeta::Make<T>().
    */
-  TypeMeta() : id_(0), itemsize_(0), ctor_(nullptr), copy_(nullptr),
-               dtor_(nullptr) {}
+  TypeMeta()
+      : id_(0), itemsize_(0), ctor_(nullptr), copy_(nullptr), dtor_(nullptr) {}
 
   /**
    * Copy constructor.
    */
   TypeMeta(const TypeMeta& src)
-      : id_(src.id_), itemsize_(src.itemsize_),
-        ctor_(src.ctor_), copy_(src.copy_), dtor_(src.dtor_) {}
+      : id_(src.id_),
+        itemsize_(src.itemsize_),
+        ctor_(src.ctor_),
+        copy_(src.copy_),
+        dtor_(src.dtor_) {}
   /**
    * Assignment operator.
    */
   TypeMeta& operator=(const TypeMeta& src) {
-    if (this == &src) return *this;
+    if (this == &src)
+      return *this;
     id_ = src.id_;
     itemsize_ = src.itemsize_;
     ctor_ = src.ctor_;
@@ -98,31 +97,45 @@
  private:
   // TypeMeta can only be created by Make, making sure that we do not
   // create incorrectly mixed up TypeMeta objects.
-  TypeMeta(CaffeTypeId i, size_t s, PlacementNew ctor, TypedCopy copy,
-           TypedDestructor dtor)
+  TypeMeta(
+      CaffeTypeId i,
+      size_t s,
+      PlacementNew ctor,
+      TypedCopy copy,
+      TypedDestructor dtor)
       : id_(i), itemsize_(s), ctor_(ctor), copy_(copy), dtor_(dtor) {}
 
  public:
   /**
    * Returns the type id.
    */
-  inline const CaffeTypeId& id() const { return id_; }
+  inline const CaffeTypeId& id() const {
+    return id_;
+  }
   /**
    * Returns the size of the item.
    */
-  inline const size_t& itemsize() const { return itemsize_; }
+  inline const size_t& itemsize() const {
+    return itemsize_;
+  }
   /**
    * Returns the placement new function pointer for individual items.
    */
-  inline PlacementNew ctor() const { return ctor_; }
+  inline PlacementNew ctor() const {
+    return ctor_;
+  }
   /**
   * Returns the typed copy function pointer for individual items.
    */
-  inline TypedCopy copy() const { return copy_; }
+  inline TypedCopy copy() const {
+    return copy_;
+  }
   /**
    * Returns the destructor function pointer for individual items.
    */
-  inline TypedDestructor dtor() const { return dtor_; }
+  inline TypedDestructor dtor() const {
+    return dtor_;
+  }
   /**
    * Returns a printable name for the type.
    */
@@ -131,11 +144,17 @@
     assert(it != gTypeNames().end());
     return it->second.c_str();
   }
-  inline bool operator==(const TypeMeta& m) const { return (id_ == m.id_); }
-  inline bool operator!=(const TypeMeta& m) const { return (id_ != m.id_); }
+  inline bool operator==(const TypeMeta& m) const {
+    return (id_ == m.id_);
+  }
+  inline bool operator!=(const TypeMeta& m) const {
+    return (id_ != m.id_);
+  }
 
   template <typename T>
-  inline bool Match() const { return (id_ == Id<T>()); }
+  inline bool Match() const {
+    return (id_ == Id<T>());
+  }
 
   // Below are static functions that can be called by passing a specific type.
 
@@ -147,22 +166,29 @@
    * is generated during run-time. Do NOT serialize the id for storage.
    */
   template <typename T>
-  static CaffeTypeId Id() {
-    static TypeNameRegisterer<T> registerer;
-    return TypeNameRegisterer<T>::Id();
-  }
+  static CaffeTypeId Id();
+
   /**
    * Returns the item size of the type. This is equivalent to sizeof(T).
    */
   template <typename T>
-  static size_t ItemSize() { return sizeof(T); }
+  static size_t ItemSize() {
+    return sizeof(T);
+  }
 
   /**
    * Returns the printable name of the type.
+   *
+   * Works for all types, not only the ones registered with CAFFE_KNOWN_TYPE
    */
   template <typename T>
   static const char* Name() {
-    return gTypeNames()[Id<T>()].c_str();
+#ifdef __GXX_RTTI
+    static string name = Demangle(typeid(T).name());
+    return name.c_str();
+#else // __GXX_RTTI
+    return "(RTTI disabled, cannot show name)";
+#endif // __GXX_RTTI
   }
 
   /**
@@ -172,11 +198,10 @@
   static void _Ctor(void* ptr, size_t n) {
     T* typed_ptr = static_cast<T*>(ptr);
     for (int i = 0; i < n; ++i) {
-      new(typed_ptr + i) T;
+      new (typed_ptr + i) T;
     }
   }
 
-
   /**
    * Typed copy function for classes.
    */
@@ -219,20 +244,20 @@
     return TypeMeta(Id<T>(), ItemSize<T>(), nullptr, nullptr, nullptr);
   }
 
-  template <typename T,
-            typename std::enable_if<
-                !std::is_fundamental<T>::value &&
-                std::is_copy_assignable<T>::value>::type* = nullptr>
+  template <
+      typename T,
+      typename std::enable_if<
+          !std::is_fundamental<T>::value &&
+          std::is_copy_assignable<T>::value>::type* = nullptr>
   static TypeMeta Make() {
-    return TypeMeta(
-        Id<T>(), ItemSize<T>(), _Ctor<T>, _Copy<T>, _Dtor<T>);
+    return TypeMeta(Id<T>(), ItemSize<T>(), _Ctor<T>, _Copy<T>, _Dtor<T>);
   }
 
   template <typename T>
   static TypeMeta Make(
       typename std::enable_if<
-          !std::is_fundamental<T>::value && !std::is_copy_assignable<T>::value
-      >::type* = 0) {
+          !std::is_fundamental<T>::value &&
+          !std::is_copy_assignable<T>::value>::type* = 0) {
     return TypeMeta(
         Id<T>(), ItemSize<T>(), _Ctor<T>, _CopyNotAllowed<T>, _Dtor<T>);
   }
@@ -245,6 +270,28 @@
   TypedDestructor dtor_;
 };
 
-}  // namespace caffe2
+/**
+ * Register unique id for a type so it can be used in TypeMeta context, e.g. be
+ * used as a type for Blob or for Tensor elements.
+ *
+ * CAFFE_KNOWN_TYPE does explicit instantiation of TypeMeta::Id<T> template
+ * function and thus needs to be put in a single translation unit (.cpp file)
+ * for a given type T. Other translation units that use type T as a type of the
+ * caffe2::Blob or element type of caffe2::Tensor need to depend on the
+ * translation unit that contains CAFFE_KNOWN_TYPE declaration via regular
+ * linkage dependencies.
+ *
+ * NOTE: the macro needs to be invoked in ::caffe2 namespace
+ */
+#define CAFFE_KNOWN_TYPE(T)                            \
+  template <>                                          \
+  CaffeTypeId TypeMeta::Id<T>() {                      \
+    static bool type_id_bit[1];                        \
+    static TypeNameRegisterer<T> registerer(           \
+        reinterpret_cast<CaffeTypeId>(type_id_bit));   \
+    return reinterpret_cast<CaffeTypeId>(type_id_bit); \
+  }
 
-#endif  // CAFFE2_CORE_TYPEID_H_
+} // namespace caffe2
+
+#endif // CAFFE2_CORE_TYPEID_H_
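
As a rough illustration of the registration rule described above (MyRecord and the file names are hypothetical), the macro goes into exactly one .cc file and must be invoked inside the ::caffe2 namespace:

    // my_record.h
    #include <string>
    struct MyRecord {
      int id;
      std::string payload;
    };

    // my_record.cc -- the single translation unit that owns the type id.
    #include "my_record.h"
    #include "caffe2/core/typeid.h"
    namespace caffe2 {
    CAFFE_KNOWN_TYPE(MyRecord);
    }  // namespace caffe2
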
diff --git a/caffe2/core/typeid_test.cc b/caffe2/core/typeid_test.cc
index 0d18964..171bca0 100644
--- a/caffe2/core/typeid_test.cc
+++ b/caffe2/core/typeid_test.cc
@@ -7,6 +7,12 @@
 
 class TypeMetaTestFoo {};
 class TypeMetaTestBar {};
+}
+
+CAFFE_KNOWN_TYPE(TypeMetaTestFoo);
+CAFFE_KNOWN_TYPE(TypeMetaTestBar);
+
+namespace {
 
 TEST(TypeMetaTest, TypeMetaStatic) {
   EXPECT_EQ(TypeMeta::ItemSize<int>(), sizeof(int));
@@ -63,10 +69,10 @@
   EXPECT_EQ(float_meta.itemsize(), TypeMeta::ItemSize<float>());
   EXPECT_EQ(foo_meta.itemsize(), TypeMeta::ItemSize<TypeMetaTestFoo>());
   EXPECT_EQ(bar_meta.itemsize(), TypeMeta::ItemSize<TypeMetaTestBar>());
-  EXPECT_EQ(int_meta.name(), TypeMeta::Name<int>());
-  EXPECT_EQ(float_meta.name(), TypeMeta::Name<float>());
-  EXPECT_EQ(foo_meta.name(), TypeMeta::Name<TypeMetaTestFoo>());
-  EXPECT_EQ(bar_meta.name(), TypeMeta::Name<TypeMetaTestBar>());
+  EXPECT_STREQ(int_meta.name(), TypeMeta::Name<int>());
+  EXPECT_STREQ(float_meta.name(), TypeMeta::Name<float>());
+  EXPECT_STREQ(foo_meta.name(), TypeMeta::Name<TypeMetaTestFoo>());
+  EXPECT_STREQ(bar_meta.name(), TypeMeta::Name<TypeMetaTestBar>());
 }
 
 
@@ -85,6 +91,12 @@
   ClassNoAssignment& operator=(const ClassNoAssignment& src) = delete;
   int x;
 };
+}
+
+CAFFE_KNOWN_TYPE(ClassAllowAssignment);
+CAFFE_KNOWN_TYPE(ClassNoAssignment);
+
+namespace {
 
 TEST(TypeMetaTest, CtorDtorAndCopy) {
   TypeMeta fundamental_meta = TypeMeta::Make<int>();
diff --git a/caffe2/core/types.cc b/caffe2/core/types.cc
index d71eb41..b582872 100644
--- a/caffe2/core/types.cc
+++ b/caffe2/core/types.cc
@@ -1,8 +1,29 @@
 #include "caffe2/core/types.h"
 #include "caffe2/core/typeid.h"
 
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+
 namespace caffe2 {
 
+CAFFE_KNOWN_TYPE(float);
+CAFFE_KNOWN_TYPE(int);
+CAFFE_KNOWN_TYPE(std::string);
+CAFFE_KNOWN_TYPE(bool);
+CAFFE_KNOWN_TYPE(uint8_t);
+CAFFE_KNOWN_TYPE(int8_t);
+CAFFE_KNOWN_TYPE(uint16_t);
+CAFFE_KNOWN_TYPE(int16_t);
+CAFFE_KNOWN_TYPE(int64_t);
+CAFFE_KNOWN_TYPE(float16);
+CAFFE_KNOWN_TYPE(double);
+CAFFE_KNOWN_TYPE(char);
+CAFFE_KNOWN_TYPE(std::unique_ptr<std::mutex>);
+CAFFE_KNOWN_TYPE(std::unique_ptr<std::atomic<bool>>);
+CAFFE_KNOWN_TYPE(std::vector<int64_t>);
+
 TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta) {
   static_assert(sizeof(int) == 4,
                 "int in this compiler does not equal to 4 bytes.");
diff --git a/caffe2/core/workspace.cc b/caffe2/core/workspace.cc
index c16ec59..9cd006d 100644
--- a/caffe2/core/workspace.cc
+++ b/caffe2/core/workspace.cc
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 #include <ctime>
+#include <mutex>
 
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
@@ -290,14 +291,24 @@
         auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
           return !got_failure && externalShouldContinue(iter);
         };
+        std::mutex exception_mutex;
+        std::exception_ptr first_exception;
         auto worker = [&]() {
           while (true) {
             int substep_id = next_substep++;
             if (got_failure || (substep_id >= step.substep().size())) {
               break;
             }
-            if (!ExecuteStepRecursive(
-                    step.substep().Get(substep_id), substepShouldContinue)) {
+            try {
+              if (!ExecuteStepRecursive(
+                      step.substep().Get(substep_id), substepShouldContinue)) {
+                got_failure = true;
+              }
+            } catch (const std::exception& ex) {
+              std::lock_guard<std::mutex> guard(exception_mutex);
+              if (!first_exception) {
+                first_exception = std::current_exception();
+              }
               got_failure = true;
             }
           }
@@ -311,6 +322,10 @@
           thread.join();
         }
         if (got_failure) {
+          LOG(ERROR) << "One of the workers died with an unhandled exception";
+          if (first_exception != nullptr) {
+            std::rethrow_exception(first_exception);
+          }
           return false;
         }
         // concurrent substeps should be careful about setting should_stop_blob
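
A standalone sketch of the pattern used above (not code from this diff): remember the first exception thrown by any worker thread and rethrow it on the launching thread after all workers have joined.

    #include <exception>
    #include <functional>
    #include <mutex>
    #include <thread>
    #include <vector>

    void RunAllOrRethrow(const std::vector<std::function<void()>>& substeps) {
      std::mutex exception_mutex;
      std::exception_ptr first_exception;
      std::vector<std::thread> threads;
      for (const auto& substep : substeps) {
        // Copy the callable into the lambda so it outlives this loop iteration.
        threads.emplace_back([&exception_mutex, &first_exception, substep] {
          try {
            substep();
          } catch (...) {
            std::lock_guard<std::mutex> guard(exception_mutex);
            if (!first_exception) {
              first_exception = std::current_exception();
            }
          }
        });
      }
      for (auto& thread : threads) {
        thread.join();
      }
      if (first_exception) {
        std::rethrow_exception(first_exception);
      }
    }
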
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
index 31ae3e6..ad43296 100644
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@@ -18,12 +18,18 @@
 class NetBase;
 
 struct StopOnSignal {
-  StopOnSignal(): handler_(SignalHandler::Action::STOP,
-                           SignalHandler::Action::STOP) {}
+  StopOnSignal()
+      : handler_(std::make_shared<SignalHandler>(
+            SignalHandler::Action::STOP,
+            SignalHandler::Action::STOP)) {}
+
+  StopOnSignal(const StopOnSignal& other) : handler_(other.handler_) {}
+
   bool operator()(int iter) {
-    return handler_.CheckForSignals() != SignalHandler::Action::STOP;
+    return handler_->CheckForSignals() != SignalHandler::Action::STOP;
   }
-  SignalHandler handler_;
+
+  std::shared_ptr<SignalHandler> handler_;
 };
 
 /**
diff --git a/caffe2/core/workspace_test.cc b/caffe2/core/workspace_test.cc
index 91fbd18..5ac3f44 100644
--- a/caffe2/core/workspace_test.cc
+++ b/caffe2/core/workspace_test.cc
@@ -8,6 +8,8 @@
 
 class WorkspaceTestFoo {};
 
+CAFFE_KNOWN_TYPE(WorkspaceTestFoo);
+
 TEST(WorkspaceTest, BlobAccess) {
   Workspace ws;
 
@@ -71,5 +73,3 @@
 }
 
 }  // namespace caffe2
-
-
diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h
index 3824a24..7ab419f 100644
--- a/caffe2/cuda_rtc/common_rtc.h
+++ b/caffe2/cuda_rtc/common_rtc.h
@@ -73,8 +73,8 @@
               unsigned int bx, unsigned int by, unsigned int bz,
               unsigned int shared_mem, cudaStream_t stream,
               Args... args) {
-    CHECK(module_loaded_)
-        << "Cannot call Launch before a module is loaded.";
+    CAFFE_ENFORCE(
+        module_loaded_, "Cannot call Launch before a module is loaded.");
     void * args_voidp[] = {&args...};
     CUDA_DRIVERAPI_CHECK(cuLaunchKernel(
         kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream,
@@ -85,8 +85,8 @@
                 unsigned int bx, unsigned int by, unsigned int bz,
                 unsigned int shared_mem, cudaStream_t stream,
                 void** extra) {
-    CHECK(module_loaded_)
-        << "Cannot call Launch before a module is loaded.";
+    CAFFE_ENFORCE(
+        module_loaded_, "Cannot call Launch before a module is loaded.");
     CUDA_DRIVERAPI_CHECK(cuLaunchKernel(
         kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream,
         nullptr, extra));
diff --git a/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc b/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
index 71d36ab..b766ff4 100644
--- a/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
+++ b/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
@@ -75,7 +75,7 @@
       : Operator<CUDAContext>(operator_def, ws) {
     const string src = OperatorBase::GetSingleArgument<string>(
         "rtc_src", "");
-    CHECK(src.size()) << "Op should have a non-zero source code size.";
+    CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
     func_.Compile(InputSize(), OutputSize(), src);
   }
   ~ElementwiseRTCOp() {}
@@ -85,8 +85,9 @@
                   "The argbuffer relies on the assumption that void* and "
                   "size_t have the same size.");
     size_t argBuffer[InputSize() + OutputSize() + 1];
-    CHECK(Input(0).size() < std::numeric_limits<int>::max())
-        << "The kernel function currently only supports int index.";
+    CAFFE_ENFORCE(
+        Input(0).size() < std::numeric_limits<int>::max(),
+        "The kernel function currently only supports int index.");
     argBuffer[0] = Input(0).size();
     void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
     for (int i = 0; i < InputSize(); ++i) {
diff --git a/caffe2/db/create_db_op.h b/caffe2/db/create_db_op.h
index bc345be..2650ad2 100644
--- a/caffe2/db/create_db_op.h
+++ b/caffe2/db/create_db_op.h
@@ -15,18 +15,25 @@
         db_type_(OperatorBase::template GetSingleArgument<string>(
             "db_type",
             "leveldb")),
-        db_name_(OperatorBase::template GetSingleArgument<string>("db", "")) {
+        db_name_(OperatorBase::template GetSingleArgument<string>("db", "")),
+        num_shards_(
+            OperatorBase::template GetSingleArgument<int>("num_shards", 1)),
+        shard_id_(
+            OperatorBase::template GetSingleArgument<int>("shard_id", 0)) {
     CHECK_GT(db_name_.size(), 0) << "Must specify a db name.";
   }
 
   bool RunOnDevice() final {
-    OperatorBase::Output<db::DBReader>(0)->Open(db_type_, db_name_);
+    OperatorBase::Output<db::DBReader>(0)->Open(
+        db_type_, db_name_, num_shards_, shard_id_);
     return true;
   }
 
  private:
   string db_type_;
   string db_name_;
+  uint32_t num_shards_;
+  uint32_t shard_id_;
   DISABLE_COPY_AND_ASSIGN(CreateDBOp);
 };
 
diff --git a/caffe2/db/db_test.cc b/caffe2/db/db_test.cc
index e0e9820..c769c65 100644
--- a/caffe2/db/db_test.cc
+++ b/caffe2/db/db_test.cc
@@ -151,5 +151,50 @@
   EXPECT_EQ(keys_set.size(), kMaxItems);
 }
 
+TEST(DBReaderShardedTest, Reader) {
+  std::string name = std::tmpnam(nullptr);
+  CreateAndFill("leveldb", name);
+
+  std::unique_ptr<DBReader> reader0(new DBReader("leveldb", name, 3, 0));
+  string key;
+  string value;
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "00");
+  EXPECT_EQ(value, "00");
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "03");
+  EXPECT_EQ(value, "03");
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "06");
+  EXPECT_EQ(value, "06");
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "09");
+  EXPECT_EQ(value, "09");
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "00");
+  EXPECT_EQ(value, "00");
+  reader0->Read(&key, &value);
+  EXPECT_EQ(key, "03");
+  EXPECT_EQ(value, "03");
+
+  CreateAndFill("leveldb", name + "1");
+  std::unique_ptr<DBReader> reader1(new DBReader("leveldb", name + "1", 3, 1));
+  reader1->Read(&key, &value);
+  EXPECT_EQ(key, "01");
+  EXPECT_EQ(value, "01");
+  reader1->Read(&key, &value);
+  EXPECT_EQ(key, "04");
+  EXPECT_EQ(value, "04");
+
+  CreateAndFill("leveldb", name + "2");
+  std::unique_ptr<DBReader> reader2(new DBReader("leveldb", name + "2", 3, 2));
+  reader2->Read(&key, &value);
+  EXPECT_EQ(key, "02");
+  EXPECT_EQ(value, "02");
+  reader2->Read(&key, &value);
+  EXPECT_EQ(key, "05");
+  EXPECT_EQ(value, "05");
+}
+
 }  // namespace db
 }  // namespace caffe2
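
A rough sketch of wiring a sharded reader through the CreateDB op changed earlier in this diff (the db path is illustrative; argument names come from the op):

    // Shard 1 of 3: with the keys 00..09 used in the test above, this reader
    // visits 01, 04, 07, ... and then wraps around.
    OperatorDef def = CreateOperatorDef(
        "CreateDB",
        "",
        vector<string>{},
        vector<string>{"reader"},
        vector<Argument>{MakeArgument<string>("db_type", "leveldb"),
                         MakeArgument<string>("db", "/tmp/example_db"),
                         MakeArgument<int>("num_shards", 3),
                         MakeArgument<int>("shard_id", 1)});
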
diff --git a/caffe2/db/rocksdb.cc b/caffe2/db/rocksdb.cc
index af6539c..61753e0 100644
--- a/caffe2/db/rocksdb.cc
+++ b/caffe2/db/rocksdb.cc
@@ -42,8 +42,8 @@
   void Commit() override {
     rocksdb::Status status = db_->Write(rocksdb::WriteOptions(), batch_.get());
     batch_.reset(new rocksdb::WriteBatch());
-    CHECK(status.ok()) << "Failed to write batch to rocksdb "
-                       << std::endl << status.ToString();
+    CAFFE_ENFORCE(
+        status.ok(), "Failed to write batch to rocksdb: " + status.ToString());
   }
 
  private:
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 5be044e..dbc09b3 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -110,7 +110,7 @@
   if (use_caffe_datum_) {
     // The input is a caffe datum format.
     caffe::Datum datum;
-    CHECK(datum.ParseFromString(value));
+    CAFFE_ENFORCE(datum.ParseFromString(value));
     *label = datum.label();
     if (datum.encoded()) {
       // encoded image in datum.
@@ -123,8 +123,8 @@
       *img = cv::Mat(datum.height(), datum.width(),
                      color_ ? CV_8UC3 : CV_8UC1);
       // Note(Yangqing): I believe that the mat should be created continuous.
-      CHECK(img->isContinuous());
-      CHECK((color_ && datum.channels() == 3) || datum.channels() == 1);
+      CAFFE_ENFORCE(img->isContinuous());
+      CAFFE_ENFORCE((color_ && datum.channels() == 3) || datum.channels() == 1);
       if (datum.channels() == 1) {
         memcpy(img->ptr<uchar>(0), datum.data().data(), datum.data().size());
       } else {
@@ -146,7 +146,7 @@
   } else {
     // The input is a caffe2 format.
     TensorProtos protos;
-    CHECK(protos.ParseFromString(value));
+    CAFFE_ENFORCE(protos.ParseFromString(value));
     const TensorProto& image_proto = protos.protos(0);
     const TensorProto& label_proto = protos.protos(1);
     if (image_proto.data_type() == TensorProto::STRING) {
@@ -166,7 +166,7 @@
           << "Image height must be bigger than crop.";
       CHECK_GE(image_proto.dims(1), crop_)
           << "Image width must be bigger than crop.";
-      CHECK(!color_ || image_proto.dims(2) == 3);
+      CAFFE_ENFORCE(!color_ || image_proto.dims(2) == 3);
       *img = cv::Mat(
           image_proto.dims(0), image_proto.dims(1), color_ ? CV_8UC3 : CV_8UC1);
       memcpy(img->ptr<uchar>(0), image_proto.byte_data().data(),
@@ -214,7 +214,7 @@
     cv::Mat scaled_img;
     // process data
     reader_->Read(&key, &value);
-    CHECK(GetImageAndLabelFromDBValue(value, &img, &label));
+    CAFFE_ENFORCE(GetImageAndLabelFromDBValue(value, &img, &label));
     // deal with scaling.
     int scaled_width, scaled_height;
     if (warp_) {
diff --git a/caffe2/mpi/mpi_common.cc b/caffe2/mpi/mpi_common.cc
index 4ee1aec..65f749a 100644
--- a/caffe2/mpi/mpi_common.cc
+++ b/caffe2/mpi/mpi_common.cc
@@ -2,10 +2,13 @@
 
 #include <thread>
 
+#include "caffe2/core/typeid.h"
 #include "caffe2/utils/proto_utils.h"
 
 namespace caffe2 {
 
+CAFFE_KNOWN_TYPE(MPICommonWorldWrapper);
+
 static std::mutex gCaffe2MPIMutex;
 
 std::mutex& MPIMutex() {
diff --git a/caffe2/mpi/mpi_ops_gpu.cc b/caffe2/mpi/mpi_ops_gpu.cc
index 53080fd..6383bc9 100644
--- a/caffe2/mpi/mpi_ops_gpu.cc
+++ b/caffe2/mpi/mpi_ops_gpu.cc
@@ -9,41 +9,38 @@
 // version supports CUDA aware MPI functions or not.
 
 #if OPEN_MPI
-  #define CAFFE2_OMPI_VERSION \
-    OMPI_MAJOR_VERSION * 10000 + OMPI_MINOR_VERSION * 100 + \
-    OMPI_RELEASE_VERSION
-  #if CAFFE2_OMPI_VERSION >= 20000
-    // openmpi 2.x now supports compile time check whether cuda support is
-    // built with openmpi.
-    #include "mpi-ext.h" /* Needed for CUDA-aware check */
-    #if MPIX_CUDA_AWARE_SUPPORT
+#define CAFFE2_OMPI_VERSION \
+  OMPI_MAJOR_VERSION * 10000 + OMPI_MINOR_VERSION * 100 + OMPI_RELEASE_VERSION
+#if CAFFE2_OMPI_VERSION >= 20000
+// OpenMPI 2.x supports a compile-time check of whether CUDA support is built in.
+#include "mpi-ext.h" /* Needed for CUDA-aware check */
+#if MPIX_CUDA_AWARE_SUPPORT
 #define CAFFE2_HAS_CUDA_MPI_BASICS 1
 #define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
 #endif // MPIX_CUDA_AWARE_SUPPORT
 #else // CAFFE2_OMPI_VERSION >= 20000
-// In the case of openmpi 1.x, we don't have compile-time flags to figure
-// out if cuda is built; as a result, we will assume that the user has built
-// openmpi with cuda.
-// CUDA-aware MPIBroadcast is introduced after openmpi 1.7.
+// In the case of OpenMPI 1.x, we don't have compile-time flags to
+// figure out if CUDA is supported; as a result, we will assume that
+// the user has built OpenMPI with CUDA support.
+// CUDA-aware MPIBroadcast is introduced after OpenMPI 1.7.
 #if CAFFE2_OMPI_VERSION >= 10700
 #define CAFFE2_HAS_CUDA_MPI_BASICS 1
 #else // CAFFE2_OMPI_VERSION >= 10700
 #define CAFFE2_HAS_CUDA_MPI_BASICS 0
 #endif // CAFFE2_OMPI_VERSION >= 10700
-
-// CUDA-aware MPIAllreduce is introduced after openmpi 1.8.5.
+// CUDA-aware MPIAllreduce is introduced after OpenMPI 1.8.5.
 #if CAFFE2_OMPI_VERSION >= 10805
 #define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
 #else // CAFFE2_OMPI_VERSION >= 10805
 #define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
 #endif // CAFFE2_OMPI_VERSION >= 10805
 #endif // CAFFE2_OMPI_VERSION >= 20000
-#else  // !OPEN_MPI
-  // We have not really tested against other MPI environments, so let's go for a
-  // safe path and basically say we don't have cuda-aware functions.
+#else // !OPEN_MPI
+// We have not really tested against other MPI environments, so let's take the
+// safe path and assume we don't have CUDA-aware functions.
 #define CAFFE2_HAS_CUDA_MPI_BASICS 0
 #define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
-#endif  // OPEN_MPI
+#endif // OPEN_MPI
 
 // We allow a macro to force using fallback functions.
 #ifdef CAFFE2_FORCE_FALLBACK_CUDA_MPI
@@ -51,7 +48,7 @@
 #undef CAFFE2_HAS_CUDA_MPI_ALLREDUCE
 #define CAFFE2_HAS_CUDA_MPI_BASICS 0
 #define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
-#endif  // CAFFE2_FORCE_FALLBACK_CUDA_MPI
+#endif // CAFFE2_FORCE_FALLBACK_CUDA_MPI
 
 namespace {
 
diff --git a/caffe2/mpi/mpi_python.cc b/caffe2/mpi/mpi_python.cc
new file mode 100644
index 0000000..b82ef32
--- /dev/null
+++ b/caffe2/mpi/mpi_python.cc
@@ -0,0 +1,36 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "caffe2/caffe2/mpi/mpi_common.h"
+
+namespace caffe2 {
+
+namespace py = pybind11;
+
+PYBIND11_PLUGIN(mpi) {
+  py::module m("mpi", "MPI helper functions");
+  m.def(
+      "SetupPeers",
+      &MPISetupPeers,
+      py::arg("replicas"),
+      py::arg("role"),
+      py::arg("job_path"));
+  m.def("CommSize", [] {
+    auto comm = GlobalMPIComm();
+    return MPICommSize(comm);
+  });
+  m.def("CommRank", [] {
+    auto comm = GlobalMPIComm();
+    return MPICommRank(comm);
+  });
+  m.def("Finalize", [] {
+    // NOTE(pietern): Doesn't seem to work when calling it
+    // from Python. It ends up calling pthread_join on a
+    // thread that doesn't exit. For now, running mpirun
+    // with `-quiet` and skipping the finalize call.
+    MPI_Finalize();
+  });
+  return m.ptr();
+}
+
+} // namespace caffe2
diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc
new file mode 100644
index 0000000..5a44691
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op.cc
@@ -0,0 +1,124 @@
+#include "caffe2/operators/batch_matmul_op.h"
+
+namespace caffe2 {
+namespace {
+
+REGISTER_CPU_OPERATOR(BatchMatMul, BatchMatMulOp<float, CPUContext>);
+
+OPERATOR_SCHEMA(BatchMatMul)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Batch matrix multiplication Yi = Ai * Bi, where A has size (C x M x K) and B
+has size (C x K x N), where C is the batch size and i ranges from 0 to C-1.
+)DOC")
+    .Input(0, "A", "3D tensor of size (C x M x K)")
+    .Input(1, "B", "3D tensor of size (C x K x N)")
+    .Output(0, "Y", "3D tensor of size (C x M x N)")
+    .Arg("trans_a", "Pass 1 to transpose A before multiplication")
+    .Arg("trans_b", "Pass 1 to transpose B before multiplication");
+
+class GetBatchMatMulGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    CHECK_EQ(def_.input_size(), 2);
+
+    bool trans_a = 0;
+    bool trans_b = 0;
+
+    if (HasArgument(Def(), "trans_a")) {
+      trans_a = GetArgument(Def(), "trans_a").i();
+    }
+    if (HasArgument(Def(), "trans_b")) {
+      trans_b = GetArgument(Def(), "trans_b").i();
+    }
+
+    const auto no_trans_arg = vector<Argument>();
+    const auto trans_a_arg = vector<Argument>{
+        MakeArgument<int>("trans_a", 1)};
+    const auto trans_b_arg = vector<Argument>{
+        MakeArgument<int>("trans_b", 1)};
+    const auto trans_both_arg = vector<Argument>{
+        MakeArgument<int>("trans_a", 1),
+        MakeArgument<int>("trans_b", 1)};
+
+    if (trans_a) {
+      if (trans_b) {
+        // A'B':
+        // dA = B'G', dB = G'A'
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_both_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_both_arg)};
+      } else {
+        // A'B:
+        // dA = BG', dB = AG
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                no_trans_arg)};
+      }
+    } else {
+      if (trans_b) {
+        // AB':
+        // dA = GB, dB = G'A
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                no_trans_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
+      } else {
+        // AB:
+        // dA = GB', dB = A'G
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
+      }
+    }
+  }
+
+  bool CopyArguments() const override {
+    return false;
+  }
+};
+
+REGISTER_GRADIENT(BatchMatMul, GetBatchMatMulGradient);
+
+} // namespace
+} // namespace caffe2
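
A quick shape check for the new op (sizes are illustrative): with C = 4, M = 2, K = 3, N = 5, A of shape (4 x 2 x 3) and B of shape (4 x 3 x 5) yield Y of shape (4 x 2 x 5); with trans_a = 1 the same Y is produced from A stored as (4 x 3 x 2). A minimal operator definition might look like:

    OperatorDef def = CreateOperatorDef(
        "BatchMatMul",
        "",
        vector<string>{"A", "B"},
        vector<string>{"Y"},
        vector<Argument>{MakeArgument<int>("trans_a", 0),
                         MakeArgument<int>("trans_b", 0)});
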
diff --git a/caffe2/operators/batch_matmul_op.h b/caffe2/operators/batch_matmul_op.h
new file mode 100644
index 0000000..33e57c8
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op.h
@@ -0,0 +1,90 @@
+#ifndef CAFFE2_OPERATORS_MATMUL_OP_H_
+#define CAFFE2_OPERATORS_MATMUL_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+template <typename T, class Context, class Engine = DefaultEngine>
+class BatchMatMulOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  BatchMatMulOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        trans_a_(OperatorBase::GetSingleArgument<int>("trans_a", 0)),
+        trans_b_(OperatorBase::GetSingleArgument<int>("trans_b", 0)) {}
+  ~BatchMatMulOp() {}
+
+  bool RunOnDevice() override {
+    const auto& A = Input(0);
+    const auto& B = Input(1);
+    auto* Y = Output(0);
+
+    CAFFE_ENFORCE_EQ(A.ndim(), 3);
+    CAFFE_ENFORCE_EQ(B.ndim(), 3);
+    CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0));
+
+    int a_dim0, a_dim1, b_dim0, b_dim1;
+
+    if (trans_a_) {
+      a_dim0 = A.dim32(2);
+      a_dim1 = A.dim32(1);
+    } else {
+      a_dim0 = A.dim32(1);
+      a_dim1 = A.dim32(2);
+    }
+
+    if (trans_b_) {
+      b_dim0 = B.dim32(2);
+      b_dim1 = B.dim32(1);
+    } else {
+      b_dim0 = B.dim32(1);
+      b_dim1 = B.dim32(2);
+    }
+
+    // Error checking
+    CAFFE_ENFORCE(
+        a_dim1 == b_dim0,
+        "Dimension mismatch: ",
+        trans_a_ ? "trans(A): " : "A: ",
+        a_dim0,
+        " ",
+        a_dim1,
+        trans_b_ ? ", trans(B): " : ", B: ",
+        b_dim0,
+        " ",
+        b_dim1);
+
+    Y->Resize(A.dim(0), a_dim0, b_dim1);
+
+    // Y = A * B
+    auto a_offset = A.size() / A.dim(0);
+    auto b_offset = B.size() / B.dim(0);
+    auto y_offset = a_dim0 * b_dim1;
+    for (int i = 0; i < A.dim32(0); ++i) {
+      math::Gemm<T, Context, Engine>(
+          trans_a_ ? CblasTrans : CblasNoTrans,
+          trans_b_ ? CblasTrans : CblasNoTrans,
+          a_dim0,
+          b_dim1,
+          a_dim1,
+          1,
+          A.template data<T>() + a_offset * i,
+          B.template data<T>() + b_offset * i,
+          0,
+          Y->template mutable_data<T>() + y_offset * i,
+          &context_);
+    }
+    return true;
+  }
+
+ protected:
+  bool trans_a_;
+  bool trans_b_;
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_MATMUL_OP_H_
diff --git a/caffe2/operators/batch_matmul_op_gpu.cc b/caffe2/operators/batch_matmul_op_gpu.cc
new file mode 100644
index 0000000..bfd77ae
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op_gpu.cc
@@ -0,0 +1,10 @@
+#include "caffe2/operators/batch_matmul_op.h"
+
+#include "caffe2/core/context_gpu.h"
+
+namespace caffe2 {
+namespace {
+
+REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<float, CUDAContext>);
+}
+}
diff --git a/caffe2/operators/communicator_op.cc b/caffe2/operators/communicator_op.cc
index 9679dcc..b2ea58b 100644
--- a/caffe2/operators/communicator_op.cc
+++ b/caffe2/operators/communicator_op.cc
@@ -51,7 +51,7 @@
     .NumInputs(2)
     .NumOutputs(1)
     .SetDoc(R"DOC(
-Does an allgather operation among the nodes. Currently only Sum is supported.
+Does an allgather operation among the nodes.
 )DOC")
     .Input(0, "comm_world", "The common world.")
     .Input(1, "X", "A tensor to be allgathered.")
diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h
index 0ed412b..41845d6 100644
--- a/caffe2/operators/concat_split_op.h
+++ b/caffe2/operators/concat_split_op.h
@@ -30,10 +30,10 @@
   SplitOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         split_(OperatorBase::GetRepeatedArgument<int>("split")) {
-    CHECK(OperatorBase::HasArgument("axis") ^
-          OperatorBase::HasArgument("order"))
-        << "You should either specify the dim to split, or the order "
-           "in the case of 4-D images.";
+    CAFFE_ENFORCE(
+        OperatorBase::HasArgument("axis") ^ OperatorBase::HasArgument("order"),
+        "You should either specify the dim to split, or the order "
+        "in the case of 4-D images.");
     if (OperatorBase::HasArgument("axis")) {
       axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
     } else {
@@ -58,10 +58,10 @@
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   ConcatOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws) {
-    CHECK(OperatorBase::HasArgument("axis") ^
-                OperatorBase::HasArgument("order"))
-        << "You should either specify the dim to split, or the order "
-           "in the case of 4-D images.";
+    CAFFE_ENFORCE(
+        OperatorBase::HasArgument("axis") ^ OperatorBase::HasArgument("order"),
+        "You should either specify the dim to split, or the order "
+        "in the case of 4-D images.");
     if (OperatorBase::HasArgument("axis")) {
       axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
     } else {
diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc
index 19667ab..70892ac 100644
--- a/caffe2/operators/conv_op_cudnn.cc
+++ b/caffe2/operators/conv_op_cudnn.cc
@@ -45,7 +45,7 @@
         deterministic_(
             OperatorBase::GetSingleArgument<int>("deterministic", 0)),
         cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
-    CHECK(!deterministic_ || !exhaustive_search_);
+    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
     CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h
index c6e1341..d25ece8 100644
--- a/caffe2/operators/conv_op_impl.h
+++ b/caffe2/operators/conv_op_impl.h
@@ -3,24 +3,33 @@
 #define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
 
 #include "caffe2/core/context.h"
+#include "caffe2/core/flags.h"
+#include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/operators/conv_op.h"
+#include "caffe2/operators/conv_op_shared.h"
 #include "caffe2/operators/conv_pool_op_base.h"
 #include "caffe2/utils/math.h"
-#include "caffe2/core/logging.h"
+
+CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
 
 namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  auto& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
   auto& bias = Input(BIAS);
-  auto* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(4 == filter.ndim());
   const int M = filter.dim32(0);
-  CAFFE_ENFORCE(C == filter.dim32(1));
+  CAFFE_ENFORCE(
+      C == filter.dim32(1),
+      "Convolution op: number of input channels does not match: input has ",
+      C,
+      " channels, but the kernel has ",
+      filter.dim32(1));
   CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
   CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
   CAFFE_ENFORCE(bias.ndim() == 1);
@@ -36,51 +45,77 @@
   const int output_image_size = Y->dim32(2) * Y->dim32(3);
   // The col buffer is stored in CHW order as well - kernel_dim, and the height
   // and width.
-  col_buffer_.Resize(vector<TIndex>{
-      C, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
+  const T* Xdata = X.template data<T>();
   if (bias_multiplier_.size() != output_image_size) {
     // If the helper bias multiplier is not M, reshape and fill it with one.
     bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
     math::Set<T, Context>(
-        output_image_size, static_cast<T>(1),
-        bias_multiplier_.template mutable_data<T>(), &context_);
+        output_image_size,
+        static_cast<T>(1),
+        bias_multiplier_.template mutable_data<T>(),
+        &context_);
   }
-  const T* Xdata = X.template data<T>();
-  T* col_buffer_data = col_buffer_.template mutable_data<T>();
   T* Ydata = Y->template mutable_data<T>();
-  // Im2col, followed by gemm.
-  for (int image_id = 0; image_id < N; ++image_id) {
-    math::Im2col<T, Context, StorageOrder::NCHW>(
-        Xdata,
-        C,
-        H,
-        W,
-        kernel_h_,
-        kernel_w_,
-        dilation_h_,
-        dilation_w_,
-        pad_t_,
-        pad_l_,
-        pad_b_,
-        pad_r_,
-        stride_h_,
-        stride_w_,
-        col_buffer_data,
-        &context_);
-    // Weight term
-    math::Gemm<T, Context>(
-        CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
-        1, filter.template data<T>(), col_buffer_data,
-        0, Ydata,
-        &context_);
-    // Bias term
-    math::Gemm<T, Context>(
-        CblasNoTrans, CblasNoTrans, M, output_image_size, 1, 1,
-        bias.template data<T>(), bias_multiplier_.template data<T>(),
-        1, Ydata,
-        &context_);
-    Xdata += input_offset;
-    Ydata += output_offset;
+
+  auto f = [&](Tensor<Context>* col_buffer) {
+    col_buffer->Resize(
+        vector<TIndex>{C, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
+
+    T* col_buffer_data = col_buffer->template mutable_data<T>();
+    // Im2col, followed by gemm.
+    for (int image_id = 0; image_id < N; ++image_id) {
+      math::Im2col<T, Context, StorageOrder::NCHW>(
+          Xdata,
+          C,
+          H,
+          W,
+          kernel_h_,
+          kernel_w_,
+          dilation_h_,
+          dilation_w_,
+          pad_t_,
+          pad_l_,
+          pad_b_,
+          pad_r_,
+          stride_h_,
+          stride_w_,
+          col_buffer_data,
+          &context_);
+      // Weight term
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          M,
+          output_image_size,
+          kernel_dim,
+          1,
+          filter.template data<T>(),
+          col_buffer_data,
+          0,
+          Ydata,
+          &context_);
+      // Bias term
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          M,
+          output_image_size,
+          1,
+          1,
+          bias.template data<T>(),
+          bias_multiplier_.template data<T>(),
+          1,
+          Ydata,
+          &context_);
+      Xdata += input_offset;
+      Ydata += output_offset;
+    }
+  };
+
+  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+    runWithSharedBuffer<Context>(ws_, f);
+  } else {
+    f(&col_buffer_);
   }
   return true;
 }
@@ -88,10 +123,10 @@
 // The implementations.
 template <typename T, class Context>
 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  auto& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
   auto& bias = Input(BIAS);
-  auto* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   CAFFE_ENFORCE(4 == filter.ndim());
   const int M = filter.dim32(0);
@@ -147,41 +182,64 @@
           output_image_size, static_cast<T>(1),
           bias_multiplier_.template mutable_data<T>(), &context_);
     }
-    col_buffer_.Resize(vector<TIndex>{
-        Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
-    T* col_buffer_data = col_buffer_.template mutable_data<T>();
-    // Im2col, followed by gemm.
-    for (int image_id = 0; image_id < N; ++image_id) {
-      math::Im2col<T, Context, StorageOrder::NHWC>(
-          Xdata,
-          C,
-          H,
-          W,
-          kernel_h_,
-          kernel_w_,
-          dilation_h_,
-          dilation_w_,
-          pad_t_,
-          pad_l_,
-          pad_b_,
-          pad_r_,
-          stride_h_,
-          stride_w_,
-          col_buffer_data,
-          &context_);
-      // Weight term
-      // Wait, is this right....?
-      math::Gemm<T, Context>(
-          CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
-          1, col_buffer_data, filter.template data<T>(), 0, Ydata,
-          &context_);
-      // Bias term
-      math::Gemm<T, Context>(
-          CblasNoTrans, CblasNoTrans, output_image_size, M, 1, 1,
-          bias_multiplier_.template data<T>(), bias.template data<T>(), 1,
-          Ydata, &context_);
-      Xdata += input_offset;
-      Ydata += output_offset;
+    auto f = [&](Tensor<Context>* col_buffer) {
+      col_buffer->Resize(
+          vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
+      T* col_buffer_data = col_buffer->template mutable_data<T>();
+      // Im2col, followed by gemm.
+      for (int image_id = 0; image_id < N; ++image_id) {
+        math::Im2col<T, Context, StorageOrder::NHWC>(
+            Xdata,
+            C,
+            H,
+            W,
+            kernel_h_,
+            kernel_w_,
+            dilation_h_,
+            dilation_w_,
+            pad_t_,
+            pad_l_,
+            pad_b_,
+            pad_r_,
+            stride_h_,
+            stride_w_,
+            col_buffer_data,
+            &context_);
+        // Weight term
+        // TODO: double-check that these transpose settings are correct.
+        math::Gemm<T, Context>(
+            CblasNoTrans,
+            CblasTrans,
+            output_image_size,
+            M,
+            kernel_dim,
+            1,
+            col_buffer_data,
+            filter.template data<T>(),
+            0,
+            Ydata,
+            &context_);
+        // Bias term
+        math::Gemm<T, Context>(
+            CblasNoTrans,
+            CblasNoTrans,
+            output_image_size,
+            M,
+            1,
+            1,
+            bias_multiplier_.template data<T>(),
+            bias.template data<T>(),
+            1,
+            Ydata,
+            &context_);
+        Xdata += input_offset;
+        Ydata += output_offset;
+      }
+    };
+    if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+      runWithSharedBuffer<Context>(ws_, f);
+    } else {
+      f(&col_buffer_);
     }
   }
   return true;
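
For reference, the NCHW path above does one im2col followed by two GEMMs per image: with kernel_dim equal to C * kernel_h_ * kernel_w_ (the flattened filter size) and output_image_size equal to out_h * out_w, the column buffer is a kernel_dim x output_image_size matrix, the filter is M x kernel_dim, so the weight GEMM yields the M x output_image_size output for that image, and the bias GEMM accumulates the length-M bias broadcast across pixels through the all-ones bias_multiplier_. A small standalone sketch of the shape bookkeeping, with made-up sizes (not Caffe2 code):

#include <cstdio>

int main() {
  // Hypothetical sizes, stride 1 and no padding, just to spell out the GEMM shapes.
  const int C = 3, H = 8, W = 8, M = 16, kernel_h = 3, kernel_w = 3;
  const int out_h = H - kernel_h + 1, out_w = W - kernel_w + 1;
  const int kernel_dim = C * kernel_h * kernel_w;   // GEMM inner dimension
  const int output_image_size = out_h * out_w;      // columns of the col buffer
  std::printf("col buffer: %d x %d\n", kernel_dim, output_image_size);
  std::printf("weight GEMM: (%d x %d) * (%d x %d) -> (%d x %d)\n",
              M, kernel_dim, kernel_dim, output_image_size, M, output_image_size);
  std::printf("bias GEMM:   (%d x 1) * (1 x %d), accumulated into Y\n",
              M, output_image_size);
  return 0;
}
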
diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc
new file mode 100644
index 0000000..acb29ec
--- /dev/null
+++ b/caffe2/operators/conv_op_shared.cc
@@ -0,0 +1,23 @@
+#include "conv_op_shared.h"
+#include "caffe2/core/context.h"
+#include "caffe2/core/flags.h"
+#include "caffe2/core/workspace.h"
+
+CAFFE2_DEFINE_bool(
+    caffe2_force_shared_col_buffer,
+    false,
+    "Always use the shared col buffer");
+
+namespace caffe2 {
+
+template <>
+void runWithSharedBuffer(
+    Workspace* ws,
+    std::function<void(Tensor<CPUContext>* buffer)> f) {
+  static std::mutex m;
+  std::lock_guard<std::mutex> g(m);
+  auto* buffer = ws->CreateBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")
+                     ->GetMutable<TensorCPU>();
+  f(buffer);
+}
+}
diff --git a/caffe2/operators/conv_op_shared.h b/caffe2/operators/conv_op_shared.h
new file mode 100644
index 0000000..939f590
--- /dev/null
+++ b/caffe2/operators/conv_op_shared.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/workspace.h"
+
+namespace caffe2 {
+
+template <typename Context>
+void runWithSharedBuffer(
+    Workspace* ws,
+    std::function<void(Tensor<Context>* buffer)> f);
+}
diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc
new file mode 100644
index 0000000..eed549f
--- /dev/null
+++ b/caffe2/operators/conv_op_shared_gpu.cc
@@ -0,0 +1,16 @@
+#include "caffe2/core/context_gpu.h"
+#include "conv_op_shared.h"
+
+namespace caffe2 {
+
+template <>
+void runWithSharedBuffer(
+    Workspace* ws,
+    std::function<void(Tensor<CUDAContext>* buffer)> f) {
+  static std::mutex m;
+  std::lock_guard<std::mutex> g(m);
+  auto* buffer = ws->CreateBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")
+                     ->GetMutable<TensorCUDA>();
+  f(buffer);
+}
+}
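
Both the CPU and CUDA specializations above funnel every participating conv op in a process through one mutex-guarded workspace blob instead of a per-op col_buffer_, trading some parallelism for peak memory. A minimal standalone sketch of the same serialize-and-share callback pattern, with a plain std::vector<float> standing in for caffe2::Tensor (all names here are illustrative only):

#include <functional>
#include <mutex>
#include <vector>

// One process-wide scratch buffer; callers pass a lambda that uses it while the lock is held.
void runWithSharedScratch(std::function<void(std::vector<float>*)> f) {
  static std::mutex m;
  static std::vector<float> shared_buffer;
  std::lock_guard<std::mutex> g(m);  // serialize access to the shared scratch
  f(&shared_buffer);
}

int main() {
  runWithSharedScratch([](std::vector<float>* buf) {
    buf->resize(1024);               // analogous to col_buffer->Resize(...)
    // ... fill and consume the buffer before the lock is released ...
  });
  return 0;
}
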
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index f465331..c83de71 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -56,7 +56,10 @@
             "stride_w",
             OperatorBase::GetSingleArgument<int>("stride", 1))),
         order_(StringToStorageOrder(
-            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
+            OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
+        shared_buffer_(
+            OperatorBase::GetSingleArgument<int>("shared_buffer", 0)),
+        ws_(ws) {
     // For the padding, they should either be the legacy padding strategy
     // (VALID or SAME), or an explicit, non-negative value.
     if (legacy_pad_ == LegacyPadding::VALID ||
@@ -196,6 +199,8 @@
   int stride_h_;
   int stride_w_;
   StorageOrder order_;
+  bool shared_buffer_;
+  Workspace* ws_;
 
   inline void ComputeSizeAndPad(
       const int in_size,
@@ -280,20 +285,22 @@
  private:
 };
 
-#define USE_CONV_POOL_BASE_FUNCTIONS(Context) \
-  USE_OPERATOR_FUNCTIONS(Context);            \
-  using ConvPoolOpBase<Context>::pad_t_;      \
-  using ConvPoolOpBase<Context>::pad_l_;      \
-  using ConvPoolOpBase<Context>::pad_b_;      \
-  using ConvPoolOpBase<Context>::pad_r_;      \
-  using ConvPoolOpBase<Context>::legacy_pad_; \
-  using ConvPoolOpBase<Context>::kernel_h_;   \
-  using ConvPoolOpBase<Context>::kernel_w_;   \
-  using ConvPoolOpBase<Context>::dilation_h_; \
-  using ConvPoolOpBase<Context>::dilation_w_; \
-  using ConvPoolOpBase<Context>::stride_h_;   \
-  using ConvPoolOpBase<Context>::stride_w_;   \
-  using ConvPoolOpBase<Context>::order_
+#define USE_CONV_POOL_BASE_FUNCTIONS(Context)    \
+  USE_OPERATOR_FUNCTIONS(Context);               \
+  using ConvPoolOpBase<Context>::pad_t_;         \
+  using ConvPoolOpBase<Context>::pad_l_;         \
+  using ConvPoolOpBase<Context>::pad_b_;         \
+  using ConvPoolOpBase<Context>::pad_r_;         \
+  using ConvPoolOpBase<Context>::legacy_pad_;    \
+  using ConvPoolOpBase<Context>::kernel_h_;      \
+  using ConvPoolOpBase<Context>::kernel_w_;      \
+  using ConvPoolOpBase<Context>::dilation_h_;    \
+  using ConvPoolOpBase<Context>::dilation_w_;    \
+  using ConvPoolOpBase<Context>::stride_h_;      \
+  using ConvPoolOpBase<Context>::stride_w_;      \
+  using ConvPoolOpBase<Context>::order_;         \
+  using ConvPoolOpBase<Context>::shared_buffer_; \
+  using ConvPoolOpBase<Context>::ws_
 
 }  // namespace caffe2
 
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc
index dbcef28..a292e49 100644
--- a/caffe2/operators/conv_transpose_op_cudnn.cc
+++ b/caffe2/operators/conv_transpose_op_cudnn.cc
@@ -45,7 +45,7 @@
         deterministic_(
             OperatorBase::GetSingleArgument<int>("deterministic", 0)),
         cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
-    CHECK(!deterministic_ || !exhaustive_search_);
+    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
     CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
     CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h
index ed5016e..64a573c 100644
--- a/caffe2/operators/conv_transpose_op_impl.h
+++ b/caffe2/operators/conv_transpose_op_impl.h
@@ -6,18 +6,21 @@
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
+#include "caffe2/operators/conv_op_shared.h"
 #include "caffe2/operators/conv_transpose_op.h"
 #include "caffe2/operators/conv_transpose_unpool_op_base.h"
 #include "caffe2/utils/math.h"
 
+CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
+
 namespace caffe2 {
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
-  auto& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
   auto& bias = Input(BIAS);
-  auto* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -40,7 +43,6 @@
   const int input_image_size = H * W;
   const int output_image_size = Y->dim32(2) * Y->dim32(3);
 
-  col_buffer_.Resize(vector<TIndex>{C, kernel_h_, kernel_w_, H, W});
   if (bias_multiplier_.size() != output_image_size) {
     bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
     math::Set<T, Context>(
@@ -50,65 +52,74 @@
         &context_);
   }
   const T* Xdata = X.template data<T>();
-  T* col_buffer_data = col_buffer_.template mutable_data<T>();
   T* Ydata = Y->template mutable_data<T>();
-  for (auto image_id = 0; image_id < N; ++image_id) {
-    // Weight term
-    math::Gemm<T, Context>(
-        CblasTrans,
-        CblasNoTrans,
-        kernel_dim,
-        input_image_size,
-        M,
-        1,
-        filter.template data<T>(),
-        Xdata,
-        0,
-        col_buffer_data,
-        &context_);
-    // Col2im
-    math::Col2im<T, Context, StorageOrder::NCHW>(
-        col_buffer_data,
-        C,
-        Y->dim32(2),
-        Y->dim32(3),
-        kernel_h_,
-        kernel_w_,
-        1,
-        1,
-        pad_t_,
-        pad_l_,
-        pad_b_,
-        pad_r_,
-        stride_h_,
-        stride_w_,
-        Ydata,
-        &context_);
-    // Bias term
-    math::Gemm<T, Context>(
-        CblasNoTrans,
-        CblasNoTrans,
-        C,
-        output_image_size,
-        1,
-        1,
-        bias.template data<T>(),
-        bias_multiplier_.template data<T>(),
-        1,
-        Ydata,
-        &context_);
-    Xdata += M * H * W;
-    Ydata += Y->size() / Y->dim32(0);
+
+  auto f = [&](Tensor<Context>* col_buffer) {
+    col_buffer->Resize(vector<TIndex>{C, kernel_h_, kernel_w_, H, W});
+    T* col_buffer_data = col_buffer->template mutable_data<T>();
+    for (auto image_id = 0; image_id < N; ++image_id) {
+      // Weight term
+      math::Gemm<T, Context>(
+          CblasTrans,
+          CblasNoTrans,
+          kernel_dim,
+          input_image_size,
+          M,
+          1,
+          filter.template data<T>(),
+          Xdata,
+          0,
+          col_buffer_data,
+          &context_);
+      // Col2im
+      math::Col2im<T, Context, StorageOrder::NCHW>(
+          col_buffer_data,
+          C,
+          Y->dim32(2),
+          Y->dim32(3),
+          kernel_h_,
+          kernel_w_,
+          1,
+          1,
+          pad_t_,
+          pad_l_,
+          pad_b_,
+          pad_r_,
+          stride_h_,
+          stride_w_,
+          Ydata,
+          &context_);
+      // Bias term
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          C,
+          output_image_size,
+          1,
+          1,
+          bias.template data<T>(),
+          bias_multiplier_.template data<T>(),
+          1,
+          Ydata,
+          &context_);
+      Xdata += M * H * W;
+      Ydata += Y->size() / Y->dim32(0);
+    }
+  };
+  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+    runWithSharedBuffer<Context>(ws_, f);
+  } else {
+    f(&col_buffer_);
   }
   return true;
 }
 
 template <typename T, class Context>
 bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
-  auto& X = Input(INPUT);
+  const Tensor<Context>& X = Input(INPUT);
   auto& filter = Input(FILTER);
   auto& bias = Input(BIAS);
-  auto* Y = Output(0);
+  Tensor<Context>* Y = Output(0);
   const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
   CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
   CAFFE_ENFORCE(
@@ -131,7 +142,6 @@
   const auto input_image_size = H * W;
   const auto output_image_size = Y->dim32(1) * Y->dim32(2);
 
-  col_buffer_.Resize(vector<TIndex>{H, W, kernel_h_, kernel_w_, C});
   if (bias_multiplier_.size() != output_image_size) {
     bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
     math::Set<T, Context>(
@@ -141,55 +151,64 @@
         &context_);
   }
   const T* Xdata = X.template data<T>();
-  T* col_buffer_data = col_buffer_.template mutable_data<T>();
   T* Ydata = Y->template mutable_data<T>();
-  for (auto image_id = 0; image_id < N; ++image_id) {
-    // Weight term
-    math::Gemm<T, Context>(
-        CblasNoTrans,
-        CblasNoTrans,
-        input_image_size,
-        kernel_dim,
-        M,
-        1,
-        Xdata,
-        filter.template data<T>(),
-        0,
-        col_buffer_data,
-        &context_);
-    // Col2im
-    math::Col2im<T, Context, StorageOrder::NHWC>(
-        col_buffer_data,
-        C,
-        Y->dim32(1),
-        Y->dim32(2),
-        kernel_h_,
-        kernel_w_,
-        1,
-        1,
-        pad_t_,
-        pad_l_,
-        pad_b_,
-        pad_r_,
-        stride_h_,
-        stride_w_,
-        Ydata,
-        &context_);
-    // Bias term
-    math::Gemm<T, Context>(
-        CblasNoTrans,
-        CblasNoTrans,
-        output_image_size,
-        C,
-        1,
-        1,
-        bias_multiplier_.template data<T>(),
-        bias.template data<T>(),
-        1,
-        Ydata,
-        &context_);
-    Xdata += M * H * W;
-    Ydata += Y->size() / Y->dim32(0);
+
+  auto f = [&](Tensor<Context>* col_buffer) {
+    col_buffer->Resize(vector<TIndex>{H, W, kernel_h_, kernel_w_, C});
+    T* col_buffer_data = col_buffer->template mutable_data<T>();
+    for (auto image_id = 0; image_id < N; ++image_id) {
+      // Weight term
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          input_image_size,
+          kernel_dim,
+          M,
+          1,
+          Xdata,
+          filter.template data<T>(),
+          0,
+          col_buffer_data,
+          &context_);
+      // Col2im
+      math::Col2im<T, Context, StorageOrder::NHWC>(
+          col_buffer_data,
+          C,
+          Y->dim32(1),
+          Y->dim32(2),
+          kernel_h_,
+          kernel_w_,
+          1,
+          1,
+          pad_t_,
+          pad_l_,
+          pad_b_,
+          pad_r_,
+          stride_h_,
+          stride_w_,
+          Ydata,
+          &context_);
+      // Bias term
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          output_image_size,
+          C,
+          1,
+          1,
+          bias_multiplier_.template data<T>(),
+          bias.template data<T>(),
+          1,
+          Ydata,
+          &context_);
+      Xdata += M * H * W;
+      Ydata += Y->size() / Y->dim32(0);
+    }
+  };
+  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+    runWithSharedBuffer<Context>(ws_, f);
+  } else {
+    f(&col_buffer_);
   }
   return true;
 }
diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h
index 59aad86..675c150 100644
--- a/caffe2/operators/conv_transpose_unpool_op_base.h
+++ b/caffe2/operators/conv_transpose_unpool_op_base.h
@@ -44,7 +44,10 @@
             "adj_w",
             OperatorBase::GetSingleArgument<int>("adj", 0))),
         order_(StringToStorageOrder(
-            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
+            OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
+        shared_buffer_(
+            OperatorBase::GetSingleArgument<int>("shared_buffer", 0)),
+        ws_(ws) {
     CAFFE_ENFORCE(kernel_h_ > 0);
     CAFFE_ENFORCE(kernel_w_ > 0);
     // For the padding, they should either be the legacy padding strategy
@@ -151,6 +154,8 @@
   int adj_h_;
   int adj_w_;
   StorageOrder order_;
+  bool shared_buffer_;
+  Workspace* ws_;
 
   inline void ComputeSizeAndPad(
       const int in_size,
@@ -182,17 +187,19 @@
   }
 };
 
-#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS     \
-  USE_OPERATOR_CONTEXT_FUNCTIONS;                    \
-  using ConvTransposeUnpoolBase<Context>::pad_t_;    \
-  using ConvTransposeUnpoolBase<Context>::pad_b_;    \
-  using ConvTransposeUnpoolBase<Context>::pad_l_;    \
-  using ConvTransposeUnpoolBase<Context>::pad_r_;    \
-  using ConvTransposeUnpoolBase<Context>::kernel_h_; \
-  using ConvTransposeUnpoolBase<Context>::kernel_w_; \
-  using ConvTransposeUnpoolBase<Context>::stride_h_; \
-  using ConvTransposeUnpoolBase<Context>::stride_w_; \
-  using ConvTransposeUnpoolBase<Context>::order_
+#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS          \
+  USE_OPERATOR_CONTEXT_FUNCTIONS;                         \
+  using ConvTransposeUnpoolBase<Context>::pad_t_;         \
+  using ConvTransposeUnpoolBase<Context>::pad_b_;         \
+  using ConvTransposeUnpoolBase<Context>::pad_l_;         \
+  using ConvTransposeUnpoolBase<Context>::pad_r_;         \
+  using ConvTransposeUnpoolBase<Context>::kernel_h_;      \
+  using ConvTransposeUnpoolBase<Context>::kernel_w_;      \
+  using ConvTransposeUnpoolBase<Context>::stride_h_;      \
+  using ConvTransposeUnpoolBase<Context>::stride_w_;      \
+  using ConvTransposeUnpoolBase<Context>::order_;         \
+  using ConvTransposeUnpoolBase<Context>::shared_buffer_; \
+  using ConvTransposeUnpoolBase<Context>::ws_
 
 } // namespace caffe2
 
diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc
index 8ce88f8..511fa97 100644
--- a/caffe2/operators/counter_ops.cc
+++ b/caffe2/operators/counter_ops.cc
@@ -8,6 +8,9 @@
 REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CPUContext>);
 REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CPUContext>);
 REGISTER_CPU_OPERATOR(CountDown, CountDownOp<int64_t, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    CheckCounterDone,
+    CheckCounterDoneOp<int64_t, CPUContext>);
 REGISTER_CPU_OPERATOR(CountUp, CountUpOp<int64_t, CPUContext>);
 REGISTER_CPU_OPERATOR(RetrieveCount, RetrieveCountOp<int64_t, CPUContext>);
 
@@ -41,6 +44,15 @@
     .Input(0, "counter", "A blob pointing to an instance of a counter.")
     .Output(0, "done", "false unless the internal count is zero.");
 
+OPERATOR_SCHEMA(CheckCounterDone)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+If the internal count value is <= 0, outputs true; otherwise outputs false.
+)DOC")
+    .Input(0, "counter", "A blob pointing to an instance of a counter.")
+    .Output(0, "done", "true if the internal count is zero or negative.");
+
 OPERATOR_SCHEMA(CountUp)
     .NumInputs(1)
     .NumOutputs(1)
@@ -67,4 +79,6 @@
 
 } // namespace
 
+CAFFE_KNOWN_TYPE(std::unique_ptr<Counter<int64_t>>);
+
 } // namespace caffe2
diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h
index 73a91e0..cd939b2 100644
--- a/caffe2/operators/counter_ops.h
+++ b/caffe2/operators/counter_ops.h
@@ -8,7 +8,6 @@
 #include "caffe2/core/operator.h"
 
 namespace caffe2 {
-namespace {
 template <typename T>
 class Counter {
  public:
@@ -28,6 +27,10 @@
     return count_.load();
   }
 
+  T checkIfDone() const {
+    return (count_.load() <= 0);
+  }
+
   void reset(T init_count) {
     count_ = init_count;
   }
@@ -35,7 +38,6 @@
  private:
   std::atomic<T> count_;
 };
-}
 
 // TODO(jiayq): deprecate these ops & consolidate them with IterOp/AtomicIterOp
 
@@ -98,6 +100,23 @@
 
 // Will always use TensorCPU regardless the Context
 template <typename T, class Context>
+class CheckCounterDoneOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  CheckCounterDoneOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
+    auto* output = OperatorBase::Output<TensorCPU>(0);
+    output->Resize(std::vector<int>{});
+    *output->template mutable_data<bool>() = counterPtr->checkIfDone();
+    return true;
+  }
+};
+
+// Will always use TensorCPU regardless the Context
+template <typename T, class Context>
 class CountUpOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
diff --git a/caffe2/operators/counter_ops_gpu.cc b/caffe2/operators/counter_ops_gpu.cc
index cda6740..de07e02 100644
--- a/caffe2/operators/counter_ops_gpu.cc
+++ b/caffe2/operators/counter_ops_gpu.cc
@@ -6,6 +6,9 @@
 REGISTER_CUDA_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CUDAContext>);
 REGISTER_CUDA_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CUDAContext>);
 REGISTER_CUDA_OPERATOR(CountDown, CountDownOp<int64_t, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+    CheckCounterDone,
+    CheckCounterDoneOp<int64_t, CUDAContext>);
 REGISTER_CUDA_OPERATOR(CountUp, CountUpOp<int64_t, CUDAContext>);
 REGISTER_CUDA_OPERATOR(RetrieveCount, RetrieveCountOp<int64_t, CUDAContext>);
 } // namespace
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index 831530a..2b1b0e1 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -634,6 +634,144 @@
   }
 };
 
+template <class Context>
+using TensorVectorPtr = std::unique_ptr<std::vector<Tensor<Context>>>;
+
+template <class Context>
+class CreateTensorVectorOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  using Operator<Context>::Operator;
+
+  bool RunOnDevice() override {
+    auto ptr = std::make_unique<std::vector<Tensor<Context>>>();
+    *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR) =
+        std::move(ptr);
+    return true;
+  }
+
+ private:
+  OUTPUT_TAGS(TENSOR_VECTOR);
+};
+
+template <class Context>
+class ConcatTensorVectorOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  using Operator<Context>::Operator;
+
+  bool RunOnDevice() override {
+    const TensorVectorPtr<Context>& tensorVector =
+        OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
+
+    auto* tensor = Output(TENSOR);
+    CAFFE_ENFORCE(!tensorVector->empty());
+
+    vector<TIndex> outputDims(tensorVector->at(0).dims());
+    CAFFE_ENFORCE(outputDims.size() > 0);
+    for (int i = 1; i < tensorVector->size(); i++) {
+      // the tensor shapes are the same except for the first dimension
+      for (int j = 1; j < tensorVector->at(i).ndim(); j++) {
+        CAFFE_ENFORCE(outputDims[j] == tensorVector->at(i).dims()[j]);
+      }
+      CAFFE_ENFORCE(tensorVector->at(0).meta() == tensorVector->at(i).meta());
+      outputDims[0] += tensorVector->at(i).dims()[0];
+    }
+
+    tensor->Resize(outputDims);
+    TIndex offset = 0;
+    auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta());
+
+    for (const auto& t : *tensorVector) {
+      context_.template CopyItems<Context, Context>(
+          t.meta(), t.size(), t.raw_data(), dst + offset);
+      offset += t.nbytes();
+    }
+
+    return true;
+  }
+
+ private:
+  INPUT_TAGS(TENSOR_VECTOR);
+  OUTPUT_TAGS(TENSOR);
+};
+
+template <class Context>
+class CollectTensorOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  CollectTensorOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        numToCollect_(
+            OperatorBase::GetSingleArgument<int>("num_to_collect", -1)),
+        numVisited_(0) {
+    CAFFE_ENFORCE(numToCollect_ > 0);
+  }
+
+  bool RunOnDevice() override {
+    // TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
+    TensorVectorPtr<Context>& tensorVector =
+        *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR_OUT);
+
+    auto* position_out = Output(POSITION_OUT);
+    const auto& tensor = Input(TENSOR_TO_COLLECT);
+
+    int pos = -1;
+    if (InputSize() >= 3) {
+      CAFFE_ENFORCE(0 == Input(POSITION_IN).ndim());
+      pos = Input(POSITION_IN).template data<int>()[0];
+    } else {
+      if (numVisited_ < numToCollect_) {
+        // append
+        pos = tensorVector->size();
+      } else {
+        CAFFE_ENFORCE(
+            tensorVector->size() == numToCollect_,
+            "TensorVecotor size = ",
+            tensorVector->size(),
+            " is different from numToCollect = ",
+            numToCollect_);
+        auto& gen = context_.RandGenerator();
+        // uniform between [0, numVisited_]
+        std::uniform_int_distribution<int> uniformDist(0, numVisited_);
+        pos = uniformDist(gen);
+        if (pos >= numToCollect_) {
+          // discard
+          pos = -1;
+        }
+      }
+    }
+
+    if (pos < 0) {
+      // discard
+      CAFFE_ENFORCE(numVisited_ >= numToCollect_);
+    } else if (pos >= tensorVector->size()) {
+      // append
+      tensorVector->push_back(Tensor<Context>());
+      tensorVector->back().template CopyFrom<Context, Context>(
+          tensor, &context_);
+    } else {
+      // replace
+      tensorVector->at(pos).template CopyFrom<Context, Context>(
+          tensor, &context_);
+    }
+
+    position_out->Resize(vector<TIndex>());
+    position_out->template mutable_data<int>()[0] = pos;
+
+    numVisited_++;
+    return true;
+  }
+
+ private:
+  // number of tensors to collect
+  int numToCollect_;
+  // number of tensors visited
+  int numVisited_;
+  INPUT_TAGS(TENSOR_VECTOR_IN, TENSOR_TO_COLLECT, POSITION_IN);
+  OUTPUT_TAGS(TENSOR_VECTOR_OUT, POSITION_OUT);
+};
+
 REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
 REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp);
 REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp);
@@ -643,6 +781,9 @@
 REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp);
 REGISTER_CPU_OPERATOR(Append, AppendOp<CPUContext>);
 REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp<CPUContext>);
+REGISTER_CPU_OPERATOR(CreateTensorVector, CreateTensorVectorOp<CPUContext>);
+REGISTER_CPU_OPERATOR(ConcatTensorVector, ConcatTensorVectorOp<CPUContext>);
+REGISTER_CPU_OPERATOR(CollectTensor, CollectTensorOp<CPUContext>);
 
 OPERATOR_SCHEMA(CreateTreeCursor)
     .NumInputs(0)
@@ -850,6 +991,45 @@
     .NumOutputs(1, INT_MAX)
     .AllowInplace([](int in, int out) { return in == out + 1; });
 
+OPERATOR_SCHEMA(CreateTensorVector)
+    .NumInputs(0)
+    .NumOutputs(1)
+    .SetDoc("Create a std::unique_ptr<std::vector<Tensor> >");
+
+OPERATOR_SCHEMA(ConcatTensorVector)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Concat Tensors in the std::unique_ptr<std::vector<Tensor> >
+along the first dimension.
+    )DOC")
+    .Input(0, "vector of Tensor", "std::unique_ptr<std::vector<Tensor> >")
+    .Output(0, "tensor", "tensor after concatenating");
+
+OPERATOR_SCHEMA(CollectTensor)
+    .NumInputs(2, 3)
+    .NumOutputs(2)
+    .EnforceInplace({{0, 0}})
+    .AllowInplace({{2, 1}})
+    .SetDoc(R"DOC(
+Collect tensors into a tensor vector by reservoir sampling. The argument
+num_to_collect indicates the max number of tensors that will be collected.
+  )DOC")
+    .Arg("num_to_collect", "The max number of tensors to collect")
+    .Input(0, "input tensor vector", "tensor vector with collected tensors")
+    .Input(1, "tensor", "new tensor will be collected by reservoir sampling")
+    .Input(2, "input position", R"DOC(
+If provided, the new tensor will be collected as indicated by the position:
+e.g. if position < 0, the new tensor is discarded; if position == k and k is
+less than the size of the input tensor vector, the tensor at position k is
+replaced with the new tensor.
+    )DOC")
+    .Output(0, "output tensor vector", "enforce inplace with input 0")
+    .Output(1, "output position", R"DOC(
+Records the position at which the new tensor was collected;
+position < 0 means it was discarded.
+    )DOC");
+
 SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
 SHOULD_NOT_DO_GRADIENT(ResetCursor);
 SHOULD_NOT_DO_GRADIENT(ReadNextBatch);
@@ -858,5 +1038,10 @@
 SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency);
 SHOULD_NOT_DO_GRADIENT(Append);
 SHOULD_NOT_DO_GRADIENT(AtomicAppend);
-}
-}
+SHOULD_NOT_DO_GRADIENT(CreateTensorVector);
+SHOULD_NOT_DO_GRADIENT(ConcatTensorVector);
+SHOULD_NOT_DO_GRADIENT(CollectTensor);
+} // namespace
+CAFFE_KNOWN_TYPE(std::unique_ptr<TreeCursor>);
+CAFFE_KNOWN_TYPE(TensorVectorPtr<CPUContext>);
+} // caffe2
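
CollectTensorOp above is a classic reservoir sampler: the first num_to_collect tensors are appended, and each later tensor is assigned a uniformly random slot in [0, numVisited_] and kept only if that slot falls below num_to_collect, so every tensor seen so far remains in the reservoir with equal probability. A self-contained sketch of the same index logic on plain integers (illustrative only, not the Caffe2 API):

#include <cstdio>
#include <random>
#include <vector>

int main() {
  const int num_to_collect = 4;
  std::vector<int> reservoir;
  std::mt19937 gen(0);
  int num_visited = 0;
  for (int item = 0; item < 100; ++item) {
    int pos;
    if (num_visited < num_to_collect) {
      pos = static_cast<int>(reservoir.size());                   // append
    } else {
      std::uniform_int_distribution<int> dist(0, num_visited);    // uniform on [0, num_visited]
      pos = dist(gen);
      if (pos >= num_to_collect) pos = -1;                        // discard
    }
    if (pos >= 0 && pos >= static_cast<int>(reservoir.size())) {
      reservoir.push_back(item);                                  // append
    } else if (pos >= 0) {
      reservoir[pos] = item;                                      // replace
    }
    ++num_visited;
  }
  for (int v : reservoir) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
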
diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h
index 02d3d90..9098ee8 100644
--- a/caffe2/operators/filler_op.h
+++ b/caffe2/operators/filler_op.h
@@ -203,7 +203,7 @@
 
   bool Fill(Tensor<Context>* output) override {
     const int fan_in = output->size() / output->dim32(0);
-    T scale = sqrt(T(3) / fan_in);
+    T scale = std::sqrt(T(3) / fan_in);
     math::RandUniform<T, Context>(
         output->size(), -scale, scale,
         output->template mutable_data<T>(), &context_);
@@ -221,7 +221,7 @@
 
   bool Fill(Tensor<Context>* output) override {
     const int fan_in = output->size() / output->dim32(0);
-    T scale = sqrt(T(2) / fan_in);
+    T scale = std::sqrt(T(2) / fan_in);
     math::RandUniform<T, Context>(
         output->size(), -scale, scale,
         output->template mutable_data<T>(), &context_);
diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc
index 5df4ce6..575b1e5 100644
--- a/caffe2/operators/index_ops.cc
+++ b/caffe2/operators/index_ops.cc
@@ -53,10 +53,10 @@
   explicit Index(TIndexValue maxElements)
     : IndexBase(maxElements, TypeMeta::Make<T>()) {}
 
-  bool Get(const T* keys, TIndexValue* values, size_t numKeys) {
+  void Get(const T* keys, TIndexValue* values, size_t numKeys) {
     if (frozen_) {
       FrozenGet(keys, values, numKeys);
-      return true;
+      return;
     }
     std::lock_guard<std::mutex> lock(dictMutex_);
     for (int i = 0; i < numKeys; ++i) {
@@ -68,10 +68,9 @@
         dict_.insert({keys[i], newValue});
         values[i] = newValue;
       } else {
-        return false;
+        CAFFE_THROW("Dict max size reached");
       }
     }
-    return true;
   }
 
   bool Load(const T* keys, size_t numKeys) {
@@ -152,8 +151,8 @@
     const auto& keys = Input(1);
     auto* values = Output(0);
     values->ResizeLike(keys);
-    return dict->Get(
-        keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
+    dict->Get(keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
+    return true;
   }
 };
 
@@ -431,6 +430,8 @@
   }
 };
 
+CAFFE_KNOWN_TYPE(std::unique_ptr<caffe2::IndexBase>);
+
 REGISTER_BLOB_SERIALIZER(
     (TypeMeta::Id<std::unique_ptr<caffe2::IndexBase>>()),
     IndexSerializer);
diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h
index 8a750a9..fc1e23e 100644
--- a/caffe2/operators/load_save_op.h
+++ b/caffe2/operators/load_save_op.h
@@ -80,7 +80,7 @@
 
         VLOG(2) << "Deserializing blob " << key;
         BlobProto proto;
-        CHECK(proto.ParseFromString(cursor->value()));
+        CAFFE_ENFORCE(proto.ParseFromString(cursor->value()));
         if (!keep_device_) {
           // If we are not keeping the device as the one specified in the
           // proto, we will set the current device.
@@ -97,7 +97,7 @@
           // different GPU.
           blob->Reset();
         }
-        CHECK(blob->Deserialize(proto));
+        CAFFE_ENFORCE(blob->Deserialize(proto));
 
         if (!blob->IsType<Tensor<Context>>()) {
           // Deal with non-tensors: we don't support chunking so we're done.
@@ -110,7 +110,7 @@
             blobSize.first->second += proto.tensor().segment().end() -
                 proto.tensor().segment().begin();
           } else {
-            CHECK(blobSize.first->second == 0);
+            CAFFE_ENFORCE(blobSize.first->second == 0);
             blobSize.first->second = tensorSize;
           }
           if (blobSize.first->second >= tensorSize) {
@@ -137,7 +137,15 @@
       }
     }
 
-    CHECK_EQ(loaded.size(), OutputSize());
+    if (loaded.size() != OutputSize()) {
+      for (const string& output_name : this->def().output()) {
+        if (loaded.count(output_name) <= 0) {
+          LOG(ERROR) << "Failed to load blob: " << output_name;
+        }
+      }
+      CAFFE_THROW(
+          "Expected to load ", OutputSize(), " blobs, ", "got ", loaded.size());
+    }
   }
 
  private:
diff --git a/caffe2/operators/lstm_unit_op.h b/caffe2/operators/lstm_unit_op.h
index d4a77a7..97eced9 100644
--- a/caffe2/operators/lstm_unit_op.h
+++ b/caffe2/operators/lstm_unit_op.h
@@ -29,7 +29,7 @@
     T* H,
     Context* context) {
   for (int n = 0; n < N; ++n) {
-    const bool valid = seqLengths[n] < t;
+    const bool valid = t < seqLengths[n];
     for (int d = 0; d < D; ++d) {
       if (!valid) {
         H[d] = 0;
@@ -69,7 +69,7 @@
     T* X_diff,
     Context* context) {
   for (int n = 0; n < N; ++n) {
-    const bool valid = seqLengths[n] < t;
+    const bool valid = t < seqLengths[n];
     for (int d = 0; d < D; ++d) {
       T* c_prev_diff = C_prev_diff + d;
       T* i_diff = X_diff + d;
diff --git a/caffe2/operators/lstm_unit_op_gpu.cu b/caffe2/operators/lstm_unit_op_gpu.cu
index 2dae099..b21a62d 100644
--- a/caffe2/operators/lstm_unit_op_gpu.cu
+++ b/caffe2/operators/lstm_unit_op_gpu.cu
@@ -31,7 +31,7 @@
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
     const int d = index % dim;
-    const bool valid = seqLengths[n] < t;
+    const bool valid = t < seqLengths[n];
     if (!valid) {
       H[index] = 0;
       C[index] = C_prev[index];
@@ -66,7 +66,7 @@
     T* X_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
-    const bool valid = seqLengths[n] < t;
+    const bool valid = t < seqLengths[n];
     const int d = index % dim;
     const T* X_offset = X + 4 * dim * n;
     T* c_prev_diff = C_prev_diff + index;
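
The flipped comparisons in the two files above are the substantive fix: timestep t is valid for sequence n only while t < seqLengths[n]. For example, with seqLengths[n] == 3, steps t = 0, 1, 2 update the hidden state, while later steps zero H and carry C_prev through unchanged; the previous seqLengths[n] < t test had the condition reversed.
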
diff --git a/caffe2/operators/matmul_op.cu b/caffe2/operators/matmul_op_gpu.cc
similarity index 100%
rename from caffe2/operators/matmul_op.cu
rename to caffe2/operators/matmul_op_gpu.cc
diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc
index 3d6227a..55333d3 100644
--- a/caffe2/operators/pack_segments.cc
+++ b/caffe2/operators/pack_segments.cc
@@ -14,9 +14,21 @@
 class PackSegmentsOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
-  USE_SIMPLE_CTOR_DTOR(PackSegmentsOp)
+  // USE_SIMPLE_CTOR_DTOR(PackSegmentsOp)
   USE_DISPATCH_HELPER;
 
+  PackSegmentsOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        pad_minf_(OperatorBase::GetSingleArgument<bool>("pad_minf", false)) {
+    if (pad_minf_) {
+      padding_ = -1.0 * std::numeric_limits<float>::infinity();
+    } else {
+      padding_ = 0;
+    }
+  }
+
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int, long>>::call(this, Input(LENGTHS));
   }
@@ -30,17 +42,22 @@
     CAFFE_ENFORCE(data.ndim() >= 1, "DATA should be at least 1-D");
     CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTH should be 1-D");
 
+    // Find the length of the longest sequence.
     const T* l = lengths.template data<T>();
-    T max_l = l[0];
+    T max_length = l[0];
     for (T i = 1; i < lengths.dim(0); ++i) {
-      max_l = std::max(max_l, l[i]);
+      max_length = std::max(max_length, l[i]);
     }
 
-    auto shape = data.dims();
-    shape[0] = max_l;
+    auto shape = data.dims(); // Shape of output is batch_size x max_len x ...
+    shape[0] = max_length;
     shape.insert(shape.begin(), lengths.size());
     output->Resize(shape);
 
+    // Pre-fill the output with the padding value (0 by default, -inf when
+    // pad_minf is set). memset cannot write a float pattern, so fill
+    // element-wise instead.
+    float* data_ptr = output->template mutable_data<float>();
+    std::fill(data_ptr, data_ptr + output->size(), padding_);
+
     int block_size = data.size() / data.dim(0);
     int block_bytesize = data.nbytes() / data.dim(0);
     const auto* d = static_cast<const char*>(data.raw_data());
@@ -51,13 +68,17 @@
           data.meta(),
           l[i] * block_size,
           d + block_bytesize * start,
-          out + block_bytesize * max_l * i);
+          out + block_bytesize * max_length * i);
       start += l[i];
     }
+
     return true;
   }
 
   INPUT_TAGS(LENGTHS, DATA);
+ private:
+  bool pad_minf_;
+  float padding_;
 };
 
 template <class Context>
@@ -82,9 +103,9 @@
 
     const T* l = lengths.template data<T>();
 
-    T max_l = l[0];
+    T max_length = l[0];
     for (T i = 1; i < lengths.dim(0); ++i) {
-      max_l = std::max(max_l, l[i]);
+      max_length = std::max(max_length, l[i]);
     }
     T total_l = std::accumulate(l, l + lengths.dim(0), 0);
 
@@ -119,7 +140,9 @@
 OPERATOR_SCHEMA(PackSegments)
     .NumInputs(2)
     .NumOutputs(1)
-    .SetDoc("Map N dim tensor to N+1 dim based on length blob")
+    .SetDoc(
+        "Map N dim tensor to N+1 dim based on length blob. Sequences that \
+    are shorter than the longest sequence are padded with zeros.")
     .Input(
         0,
         "lengths",
@@ -130,7 +153,10 @@
         "packed_tensor",
         "N + 1 dim Tesor"
         "where dim(1) is the max length"
-        ", dim(0) is the batch size.");
+        ", dim(0) is the batch size.")
+    .Arg(
+        "pad_minf", "Padding number in the packed segments. Use true to pad \
+    -infinity, otherwise pad zeros");
 OPERATOR_SCHEMA(UnpackSegments)
     .NumInputs(2)
     .NumOutputs(1)
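
As a worked example of the PackSegments schema above (illustrative sizes): with lengths = [2, 3] and DATA of shape 5 x D, max_length is 3, so packed_tensor has shape 2 x 3 x D; rows 0-1 of DATA fill the first segment, rows 2-4 fill the second, and the single unused slot (segment 0, position 2) holds the padding value, 0 by default or -inf when pad_minf is set.
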
diff --git a/caffe2/operators/packed_fc_op.cc b/caffe2/operators/packed_fc_op.cc
new file mode 100644
index 0000000..a4d392a
--- /dev/null
+++ b/caffe2/operators/packed_fc_op.cc
@@ -0,0 +1,139 @@
+#include <cstdint>
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/mkl_utils.h"
+
+#ifdef CAFFE2_HAS_MKL_SGEMM_PACK
+
+namespace caffe2 {
+
+CAFFE_KNOWN_TYPE(mkl::MKLPackedMatrix);
+
+namespace mkl {
+
+class PackedFCOp final : public Operator<CPUContext> {
+ public:
+  USE_OPERATOR_FUNCTIONS(CPUContext);
+  PackedFCOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws),
+        axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {}
+  ~PackedFCOp() {}
+
+  bool RunOnDevice() override {
+    const auto& X = Input(0);
+    const auto& b = Input(2);
+    auto* Y = Output(0);
+    CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
+    // batch size
+    const auto canonical_axis = X.canonical_axis_index(axis_);
+    const int M = X.size_to_dim(canonical_axis);
+    const int K = X.size_from_dim(canonical_axis);
+    const int N = b.size();
+
+    // Check out what is the passed in format.
+    const MKLPackedMatrix* packed_matrix = nullptr;
+    if (OperatorBase::InputIsType<TensorCPU>(1)) {
+      const auto& W = Input(1);
+      CAFFE_ENFORCE_EQ(W.ndim(), 2);
+      CAFFE_ENFORCE_EQ(W.dim32(0), N);
+      CAFFE_ENFORCE_EQ(W.dim32(1), K);
+      // Note(jiayq): This will strictly check that we have a proper usage of
+      // the PackedFC operator. The motivation is that, this operator is
+      // stateful unlike most ops in Caffe2, but checking whether the weight
+      // has changed matters quite a lot in the critical path. We only enable
+      // this test during DEBUG mode for performance considerations.
+      DCHECK(hash_ == 0 || hash_ == Hash(W.template data<float>(), W.size()))
+          << "PackedFCOp is currently stateful: you should not change the "
+             "weight during runtime. This is only sanity-checked in debug "
+             "mode for speed considerations.";
+      if (!local_packed_matrix_.get() || local_packed_matrix_->n_ != M) {
+        // If there is no pre packed matrix, or the batch size changed, we
+        // do a re-pack.
+        // Note that the packed sgemm follows the blas interfaces, not cblas
+        local_packed_matrix_.reset(new MKLPackedMatrix(
+            'A', 'T', N, M, K, 1.f, W.template data<float>(), K));
+      }
+      packed_matrix = local_packed_matrix_.get();
+    } else if (OperatorBase::InputIsType<MKLPackedMatrix>(1)) {
+      packed_matrix = &OperatorBase::Input<MKLPackedMatrix>(1);
+    }
+    CAFFE_ENFORCE(
+        packed_matrix,
+        "Input 1 must be either a TensorCPU or an MKLPackedMatrix.");
+    CAFFE_ENFORCE_EQ(packed_matrix->m_, N);
+    CAFFE_ENFORCE_EQ(packed_matrix->k_, K);
+    CAFFE_ENFORCE_EQ(packed_matrix->n_, M);
+    // Do we want to check the other flags as well?
+
+    Y->Resize(M, N);
+
+    const float kZero = 0;
+    sgemm_compute(
+        "P",
+        "N",
+        &N,
+        &M,
+        &K,
+        packed_matrix->data_,
+        &K,
+        X.template data<float>(),
+        &K,
+        &kZero,
+        Y->template mutable_data<float>(),
+        &N);
+
+    // Add bias term
+    if (bias_multiplier_.size() != M) {
+      // If the helper bias multiplier is not M, reshape and fill it with one.
+      bias_multiplier_.Resize(M);
+      math::Set<float, CPUContext>(
+          M, 1.f, bias_multiplier_.template mutable_data<float>(), &context_);
+    }
+    math::Gemm<float, CPUContext>(
+        CblasNoTrans,
+        CblasNoTrans,
+        M,
+        N,
+        1,
+        1,
+        bias_multiplier_.template data<float>(),
+        b.template data<float>(),
+        1,
+        Y->template mutable_data<float>(),
+        &context_);
+    return true;
+  }
+
+ protected:
+  uint32_t Hash(const float* ptr, size_t n) {
+    uint32_t hash = 0;
+    const uint32_t* ptr_i = reinterpret_cast<const uint32_t*>(ptr);
+    for (int i = 0; i < n; ++i) {
+      hash ^= ptr_i[i];
+    }
+    return hash;
+  }
+  size_t axis_{1};
+  uint32_t hash_{0};
+  Tensor<CPUContext> bias_multiplier_;
+  std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
+};
+
+} // namespace mkl
+
+REGISTER_CPU_OPERATOR(PackedFC, mkl::PackedFCOp);
+
+OPERATOR_SCHEMA(PackedFC).NumInputs(3).NumOutputs(1).SetDoc(R"DOC(
+Computes the result of passing an input vector X into a fully connected
+layer with 2D weight matrix W and 1D bias vector b. This is essentially the
+same as the FC operator but allows one to pack the weight matrix for more
+efficient inference. See the schema for the FC op for details.
+
+Unlike many other operators in Caffe2, this operator is stateful: it assumes
+that the input weight matrix W never changes, so it is only suitable for
+inference time when the weight matrix never gets updated by any other ops.
+Due to performance considerations, this is not checked in non-debug builds.
+)DOC");
+
+SHOULD_NOT_DO_GRADIENT(PackedFC);
+} // namespace caffe2
+
+#endif // CAFFE2_HAS_MKL_SGEMM_PACK
diff --git a/caffe2/operators/partition_ops.cc b/caffe2/operators/partition_ops.cc
index ca998b9..84a0b04 100644
--- a/caffe2/operators/partition_ops.cc
+++ b/caffe2/operators/partition_ops.cc
@@ -3,13 +3,14 @@
 namespace caffe2 {
 namespace {
 
-REGISTER_CPU_OPERATOR(Sharding, ShardingOp<CPUContext>);
+REGISTER_CPU_OPERATOR(Partition, PartitionOp);
+REGISTER_CPU_OPERATOR(LengthsPartition, LengthsPartitionOp);
 
-OPERATOR_SCHEMA(Sharding)
-  .NumInputsOutputs([](int in, int out) {
-    return in > 0 && out > 0 && out % in == 0;
-  })
-  .SetDoc(R"DOC(
+OPERATOR_SCHEMA(Partition)
+    .NumInputsOutputs([](int in, int out) {
+      return in > 0 && out > 0 && out % in == 0;
+    })
+    .SetDoc(R"DOC(
 Sharding splits the input int tensor into multiple ones according to the first
 tensor.
 
@@ -25,19 +26,69 @@
 X_ij / num_partitions.
 
 Outputs are ordered as
-X_0_part_0, X_0_part_1, ..., X_0_part_K-1, X_1_part_0, ..., X_N-1_part_K-1
+X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
 )DOC")
-  .Arg("pack_first_input", "(int, default 0) If set, the operator transforms "
-       "the first tensor values as floor(X_ij / num_partitions)")
-  .Input(0, "input", "Input tensor containing data to be sharded. The "
-         "number of input tensors might be greater than 1 but must have the "
-         "same shape as the previous tensors.")
-  .Output(0, "shards", "Output Shards. The number of output shards has to be a "
-          "multiple of the number of input shards.");
+    .Arg(
+        "pack_first_input",
+        "(int, default 0) If set, the operator transforms "
+        "the first tensor values as floor(X_ij / num_partitions)")
+    .Input(
+        0,
+        "input",
+        "Input tensor containing data to be sharded. The "
+        "number of input tensors might be greater than 1 but must have the "
+        "same shape as the previous tensors.")
+    .Output(
+        0,
+        "shards",
+        "Output Shards. The number of output shards has to be a "
+        "multiple of the number of input shards.");
+
+OPERATOR_SCHEMA(LengthsPartition)
+    .NumInputsOutputs([](int in, int out) {
+      return in >= 2 && out > 0 && out % in == 0;
+    })
+    .SetDoc(R"DOC(
+LengthsPartition splits the input int tensor into multiple ones according to the
+second tensor. The first dimension is expected to be the tensor that describes
+lengths of the elements.
+
+Takes the second input and partitions it to shards according to the remainder of
+values modulo the number of partitions. It requires the second tensor to be
+a 1D-tensor of the integral type. The first tensor should be 1D-tensor of int32
+that would represent the lengths of the elements in the input. The number of
+partitions is derived as (num_output / num_input).
+
+If additional inputs are present they must have the same shape as the first
+input, optionally with extra trailing dimensions. They will be partitioned
+accordingly to the first input.
+
+Optional arg 'pack_first_input' transforms the first tensor values as
+X_ij / num_partitions.
+
+Outputs are ordered as
+X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
+)DOC")
+    .Arg(
+        "pack_first_input",
+        "(int, default 0) If set, the operator transforms "
+        "the first tensor values as floor(X_ij / num_partitions)")
+    .Input(
+        0,
+        "input",
+        "Input tensor containing data to be sharded. The "
+        "number of input tensors might be greater than 1 but must have the "
+        "same shape as the previous tensors.")
+    .Output(
+        0,
+        "shards",
+        "Output Shards. The number of output shards has to be a "
+        "multiple of the number of input shards.");
 
 // This should actually have gradient, but for now nothing uses it.
 // Because gradient computation right now is not input/output aware it can't be
 // GRADIENT_NOT_IMPLEMENTEDYET
 NO_GRADIENT(Sharding);
+NO_GRADIENT(LengthsPartition);
 } // namespace
 } // namespace caffe2
diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h
index f2bbefa..1f1c74f 100644
--- a/caffe2/operators/partition_ops.h
+++ b/caffe2/operators/partition_ops.h
@@ -6,29 +6,27 @@
 
 namespace caffe2 {
 
-template <class Context>
-class ShardingOp : public Operator<Context> {
+class PartitionOpBase : public Operator<CPUContext> {
  public:
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-  USE_DISPATCH_HELPER;
+  USE_OPERATOR_FUNCTIONS(CPUContext);
 
-  ShardingOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<Context>(operator_def, ws),
+  PartitionOpBase(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws),
         OP_SINGLE_ARG(int, "pack_first_input", pack_first_input_, 0) {}
 
-  bool RunOnDevice() override {
-    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
-  }
-
- private:
+ protected:
   template <typename Index>
-  bool DoRunWithType() {
-    CHECK_EQ(OutputSize() % InputSize(), 0)
-        << "Output number must be a multiple of input number";
+  void ApplyPartition(bool skipFirstArgument) {
+    CAFFE_ENFORCE_EQ(
+        OutputSize() % InputSize(),
+        0,
+        "Output number must be a multiple of input number");
     int partitions = OutputSize() / InputSize();
-    CHECK_GT(partitions, 0);
+    int inputSize = InputSize();
+    int mainInputIndex = skipFirstArgument;
+    CAFFE_ENFORCE_GT(partitions, 0, "Invalid number of partitions");
 
-    auto& main_input = Input(0);
+    auto& main_input = Input(mainInputIndex);
     TIndex size = main_input.size();
     const Index* data = main_input.template data<Index>();
     counts_.assign(partitions, 0);
@@ -40,32 +38,43 @@
       ++counts_[shard];
     }
 
-    raw_datas_.resize(InputSize());
-    block_sizes_.resize(InputSize());
+    raw_datas_.resize(inputSize);
+    block_sizes_.resize(inputSize);
+    metas_.resize(inputSize);
     out_datas_.resize(OutputSize());
-    for (int i = 0; i < InputSize(); ++i) {
+    for (int i = mainInputIndex; i < inputSize; ++i) {
       auto& input = Input(i);
-      if (i > 0) {
-        CHECK_GE(input.ndim(), main_input.ndim())
-            << "Prefix of extra input's shape must match main input's shape, "
-            << "input: " << i;
+      if (i > mainInputIndex) {
+        CAFFE_ENFORCE_GE(
+            input.ndim(),
+            main_input.ndim(),
+            "Prefix of extra input's shape must match main input's shape, ",
+            "input: ",
+            i);
         for (int j = 0; j < main_input.ndim(); ++j) {
-          CHECK_GE(input.dim(j), main_input.dim(j))
-              << "Prefix of extra input's shape must match main input's shape, "
-              << "input: " << i << ", dim " << j;
+          CAFFE_ENFORCE_GE(
+              input.dim(j),
+              main_input.dim(j),
+              "Prefix of extra input's shape must match main input's shape, ",
+              "input: ",
+              i,
+              ", dim ",
+              j);
         }
-        CHECK(input.meta().copy() == nullptr)
-            << "Only primitive types are supported, input " << i;
+        CAFFE_ENFORCE(
+            input.meta().copy() == nullptr,
+            "Only primitive types are supported, input ",
+            i);
       }
       raw_datas_[i] = input.raw_data();
-      block_sizes_[i] =
-          input.size_from_dim(main_input.ndim()) * input.itemsize();
+      block_sizes_[i] = input.size_from_dim(main_input.ndim());
+      metas_[i] = input.meta();
       // shape = partition_size + suffix of input dims
       vector<TIndex> shape(
           input.dims().begin() + main_input.ndim() - 1, input.dims().end());
       for (int j = 0; j < partitions; ++j) {
-        int out_idx = i * partitions + j;
-        auto* output = Output(out_idx);
+        int out_idx = i + j * inputSize;
+        auto output = Output(out_idx);
         shape[0] = counts_[j];
         output->Resize(shape);
         out_datas_[out_idx] = output->raw_mutable_data(input.meta());
@@ -81,21 +90,22 @@
       TIndex idx = counts_[shard]++;
 
       // special case first input
-      static_cast<Index*>(out_datas_[shard])[idx] =
+      static_cast<Index*>(out_datas_[shard * inputSize + mainInputIndex])[idx] =
           pack_first_input_ ? ((data[p] - shard) / partitions) : data[p];
 
-      for (int i = 1, j = shard + partitions; i < InputSize();
-           ++i, j += partitions) {
+      int baseIndex = shard * inputSize;
+      for (int i = mainInputIndex + 1; i < inputSize; ++i) {
         auto bs = block_sizes_[i];
+        auto meta = metas_[i];
         // special case for small bs?
-        context_.template CopyBytes<Context, Context>(
+        context_.template CopyItems<CPUContext, CPUContext>(
+            meta,
             bs,
-            static_cast<const char*>(raw_datas_[i]) + p * bs,
-            static_cast<char*>(out_datas_[j]) + idx * bs);
+            static_cast<const char*>(raw_datas_[i]) + p * bs * meta.itemsize(),
+            static_cast<char*>(out_datas_[baseIndex + i]) +
+                idx * bs * meta.itemsize());
       }
     }
-
-    return true;
   }
 
   bool pack_first_input_;
@@ -103,10 +113,101 @@
   // use member fields to reuse memory
   vector<TIndex> counts_;
   vector<TIndex> block_sizes_;
+  vector<TypeMeta> metas_;
   vector<const void*> raw_datas_;
   vector<void*> out_datas_;
+};
 
-  DISABLE_COPY_AND_ASSIGN(ShardingOp);
+class PartitionOp : public PartitionOpBase {
+ public:
+  USE_DISPATCH_HELPER;
+
+  PartitionOp(const OperatorDef& operator_def, Workspace* ws)
+      : PartitionOpBase(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
+  }
+
+ private:
+  template <typename Index>
+  bool DoRunWithType() {
+    ApplyPartition<Index>(false /* skipFirstArgument */);
+    return true;
+  }
+
+  DISABLE_COPY_AND_ASSIGN(PartitionOp);
+};
+
+class LengthsPartitionOp : public PartitionOpBase {
+ public:
+  USE_DISPATCH_HELPER;
+
+  LengthsPartitionOp(const OperatorDef& operator_def, Workspace* ws)
+      : PartitionOpBase(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(1));
+  }
+
+ private:
+  template <typename Index>
+  bool DoRunWithType() {
+    CAFFE_ENFORCE(
+        OutputSize() % InputSize() == 0,
+        "Output number must be a multiple of input number");
+    int partitions = OutputSize() / InputSize();
+    CAFFE_ENFORCE_GT(partitions, 0, "Invalid number of partitions");
+    CAFFE_ENFORCE_EQ(
+        Input(1).ndim(),
+        1,
+        "Only 1-D tensors supported as a partitioning tensor for sharding");
+
+    // Apply sharding to all parameters except lengths
+    ApplyPartition<Index>(true /* skipFirstArgument */);
+
+    // Compute lengths after sharding
+    auto& main_input = Input(1);
+    TIndex size = main_input.size();
+    const Index* data = main_input.template data<Index>();
+
+    auto& length_input = Input(0);
+    TIndex elements = length_input.size();
+    const int32_t* lengths_data = length_input.template data<int32_t>();
+    out_length_.resize(partitions);
+    for (int i = 0; i < partitions; ++i) {
+      auto& output = *Output(i * InputSize());
+      output.Resize(elements);
+      out_length_[i] = output.template mutable_data<int32_t>();
+    }
+
+    int total_length = 0;
+    for (int i = 0; i < elements; ++i) {
+      total_length += lengths_data[i];
+    }
+    CAFFE_ENFORCE(
+        total_length == size,
+        "Total length does not match the number of elements");
+
+    int index = 0;
+    for (int i = 0; i < elements; ++i) {
+      for (int j = 0; j < partitions; ++j) {
+        out_length_[j][i] = 0;
+      }
+      for (int j = 0; j < lengths_data[i]; ++j, ++index) {
+        // TODO: support other partition functions
+        int shard = data[index] % partitions;
+        // equivalent to `if (shard < 0) shard += partitions;`
+        shard += partitions & (shard >> (sizeof(int) * 8 - 1));
+        ++out_length_[shard][i];
+      }
+    }
+    return true;
+  }
+
+  DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp);
+
+  vector<int32_t*> out_length_;
 };
 
 } // namespace caffe2
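A minimal standalone sketch of what the LengthsPartitionOp added above computes with the modulo partition function (illustrative only, values chosen arbitrarily):

    // Sketch: shard ids with `id % partitions` and recompute per-shard LENGTHS,
    // mirroring LengthsPartitionOp::DoRunWithType above.
    #include <cstdio>
    #include <vector>

    int main() {
      const int partitions = 2;
      std::vector<int> lengths = {2, 3};        // LENGTHS input (2 segments)
      std::vector<int> ids = {4, -3, 1, 2, 7};  // main input, sum(lengths) ids

      std::vector<std::vector<int>> out_length(
          partitions, std::vector<int>(lengths.size(), 0));
      int index = 0;
      for (size_t i = 0; i < lengths.size(); ++i) {
        for (int j = 0; j < lengths[i]; ++j, ++index) {
          int shard = ids[index] % partitions;
          // branchless form of `if (shard < 0) shard += partitions;`
          shard += partitions & (shard >> (sizeof(int) * 8 - 1));
          ++out_length[shard][i];
        }
      }
      for (int j = 0; j < partitions; ++j) {
        std::printf("shard %d lengths: %d %d\n",
                    j, out_length[j][0], out_length[j][1]);
      }
      // prints "shard 0 lengths: 1 1" and "shard 1 lengths: 1 2"
      return 0;
    }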
diff --git a/caffe2/operators/recurrent_network_op.h b/caffe2/operators/recurrent_network_op.h
index e73eac9..a15ef6af 100644
--- a/caffe2/operators/recurrent_network_op.h
+++ b/caffe2/operators/recurrent_network_op.h
@@ -336,14 +336,15 @@
     const auto stepNet =
         OperatorBase::GetSingleArgument<string>("backward_step_net", "");
     NetDef stepNetDef;
-    CHECK(google::protobuf::TextFormat::ParseFromString(stepNet, &stepNetDef));
+    CAFFE_ENFORCE(
+        google::protobuf::TextFormat::ParseFromString(stepNet, &stepNetDef));
     ws_.CreateBlob(timestep_)->template GetMutable<TensorCPU>()->Resize(1);
 
     for (const auto& blob : stepNetDef.external_input()) {
       ws_.CreateBlob(blob);
     }
     stepNet_ = ws_.CreateNet(stepNetDef);
-    CHECK(stepNet_);
+    CAFFE_ENFORCE(stepNet_);
   }
 
   std::vector<detail::Scratch> constructScratches(Workspace* sharedWs) {
@@ -483,7 +484,7 @@
                             ->template Get<Tensor<Context>>();
         auto* ag = CHECK_NOTNULL(ws_.GetBlob(param.accGrad))
                        ->template GetMutable<Tensor<Context>>();
-        CHECK(ag->dims() == g.dims());
+        CAFFE_ENFORCE(ag->dims() == g.dims());
         math::Add<T, Context>(
             g.size(),
             g.template data<T>(),
diff --git a/caffe2/operators/recurrent_op_cudnn.cc b/caffe2/operators/recurrent_op_cudnn.cc
index d8379a5..6dffab1 100644
--- a/caffe2/operators/recurrent_op_cudnn.cc
+++ b/caffe2/operators/recurrent_op_cudnn.cc
@@ -72,7 +72,7 @@
   CHECK_GT(hiddenSize, 0);
   const auto bidirectional =
       OperatorBase::GetSingleArgument<int>("bidirectional", 0);
-  CHECK(bidirectional == 0 || bidirectional == 1);
+  CAFFE_ENFORCE(bidirectional == 0 || bidirectional == 1);
   const auto numDirections = bidirectional == 1 ? 2 : 1;
   const auto outputDim = hiddenSize * numDirections;
   const auto rnnDirection =
@@ -81,11 +81,11 @@
   CHECK_GT(numLayers, 0);
   const auto& rnnModeStr =
       OperatorBase::GetSingleArgument<string>("rnn_mode", "");
-  CHECK(rnnModeStr == "lstm" || rnnModeStr == "gru");
+  CAFFE_ENFORCE(rnnModeStr == "lstm" || rnnModeStr == "gru");
   const auto rnnMode = rnnModeStr == "lstm" ? CUDNN_LSTM : CUDNN_GRU;
   const auto& rnnInputStr =
       OperatorBase::GetSingleArgument<string>("input_mode", "");
-  CHECK(rnnInputStr == "linear" || rnnInputStr == "skip");
+  CAFFE_ENFORCE(rnnInputStr == "linear" || rnnInputStr == "skip");
   const auto rnnInput =
       rnnInputStr == "linear" ? CUDNN_LINEAR_INPUT : CUDNN_SKIP_INPUT;
   // Dropout setup
diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc
index 0fe44be..d21ae39 100644
--- a/caffe2/operators/segment_reduction_op.cc
+++ b/caffe2/operators/segment_reduction_op.cc
@@ -39,15 +39,19 @@
     const SIndex* s_ids = segment_ids.template data<SIndex>();
     const T* d = data.template data<T>();
 
-    CHECK_GT(N, 0);
-    const SIndex K = s_ids[N - 1] + 1;
+    const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
     auto shape = data.dims();
     shape[0] = K;
     output->Resize(shape);
 
-    TIndex block_size = data.size() / N;
     T* out = output->template mutable_data<T>();
 
+    if (N == 0) {
+      return true;
+    }
+
+    TIndex block_size = data.size() / N;
+
     // Assume the segments are sorted and there are no gaps
     CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
     for (TIndex i = 0; i < N;) {
@@ -106,9 +110,14 @@
     data_grads->Resize(shape);
 
     const SIndex K = segment_grads.dim(0);
-    TIndex block_size = segment_grads.size() / K;
     T* out = data_grads->template mutable_data<T>();
 
+    if (N == 0) {
+      return true;
+    }
+
+    TIndex block_size = segment_grads.size_from_dim(1);
+
     // Assume the segments are sorted and there are no gaps
     CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
     // repeat the check from forward op
@@ -264,7 +273,7 @@
     output->Resize(shape);
 
     TIndex in_block_size = data.size_from_dim(num_reduce_dims_);
-    TIndex block_num = data.size() / in_block_size;
+    TIndex block_num = in_block_size > 0 ? data.size() / in_block_size : 0;
     T* out = output->template mutable_data<T>();
 
     Reducer r(ctx, out, &context_);
@@ -321,7 +330,7 @@
     data_grads->Resize(shape);
 
     TIndex block_size = data_grads->size_from_dim(num_reduce_dims_);
-    TIndex block_num = data_grads->size() / block_size;
+    TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0;
     T* out = data_grads->template mutable_data<T>();
 
     ReducerGradient r(ctx, r_grad, &context_);
@@ -431,11 +440,9 @@
   USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
 
   bool RunOnDevice() override {
-    auto& data = Input(0);
-    const TIndex M = data.dim(0);
     // If more complicated fixed size logic becomes necessary, it can be moved
     // to the reducer class
-    TIndex in_block_size = data.size() / M;
+    TIndex in_block_size = Input(0).size_from_dim(1);
     return DispatchHelper<typename Reducer::FixedDispatch>::call(
         this, in_block_size);
   }
@@ -477,16 +484,18 @@
     const SIndex* s_ids = segment_ids.template data<SIndex>();
     const T* d = data.template data<T>();
 
-    CHECK_GT(N, 0);
-    const SIndex K = s_ids[N - 1] + 1;
+    const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
     vector<TIndex> shape;
     shape.push_back(K);
     ctx.appendOutputShape(&shape);
     output->Resize(shape);
 
-    TIndex in_block_size = data.size() / M;
-    TIndex out_block_size = output->size() / K;
     T* out = output->template mutable_data<T>();
+    if (N == 0) {
+      return true;
+    }
+    TIndex in_block_size = data.size_from_dim(1);
+    TIndex out_block_size = output->size_from_dim(1);
 
     // Assume the segments are sorted and there are no gaps
     CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
@@ -497,8 +506,12 @@
       for (; i < N && s_ids[start] == s_ids[i]; ++i) {
         TIndex idx;
         if (SparseFused) { // static if
-          CHECK(0 <= idxs[i] && idxs[i] < M)
-              << "Index out of bounds: " << idxs[i] << ", range 0 to " << M;
+          CAFFE_ENFORCE(
+              0 <= idxs[i] && idxs[i] < M,
+              "Index out of bounds: ",
+              idxs[i],
+              ", range 0 to ",
+              M);
           idx = idxs[i];
         } else {
           idx = i;
@@ -532,10 +545,9 @@
   USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
 
   bool RunOnDevice() override {
-    auto& segment_grads = Input(SEGMENT_GRADS);
     // If more complicated fixed size logic becomes necessary, it can be moved
     // to the reducer class
-    TIndex grad_block_size = segment_grads.size() / segment_grads.dim(0);
+    TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
     return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
         this, grad_block_size);
   }
@@ -566,11 +578,15 @@
     ctx.appendGradShape(&shape);
     data_grads->Resize(shape);
 
-    TIndex d_block_size = data_grads->size() / data_grads->dim(0);
+    TIndex d_block_size = data_grads->size_from_dim(1);
     const SIndex K = segment_grads.dim(0);
-    TIndex s_block_size = segment_grads.size() / K;
+    TIndex s_block_size = segment_grads.size_from_dim(1);
     T* out = data_grads->template mutable_data<T>();
 
+    if (N == 0) {
+      return true;
+    }
+
     // Assume the segments are sorted and there are no gaps
     CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
     // repeat the check from forward op
@@ -784,11 +800,9 @@
         OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
 
   bool RunOnDevice() override {
-    auto& data = Input(0);
-    const TIndex M = data.dim(0);
     // If more complicated fixed size logic becomes necessary, it can be moved
     // to the reducer class
-    TIndex in_block_size = data.size() / M;
+    TIndex in_block_size = Input(0).size_from_dim(1);
     return DispatchHelper<typename Reducer::FixedDispatch>::call(
         this, in_block_size);
   }
@@ -846,8 +860,8 @@
     ctx.appendOutputShape(&shape);
     output->Resize(shape);
 
-    TIndex in_block_size = data.size() / M;
-    TIndex out_block_size = output->size() / K;
+    TIndex in_block_size = data.size_from_dim(1);
+    TIndex out_block_size = output->size_from_dim(1);
     T* out = output->template mutable_data<T>();
 
     reducers_.clear();
@@ -858,12 +872,20 @@
 
     for (TIndex i = 0; i < N; ++i) {
       auto s_id = s_ids[i];
-      CHECK(0 <= s_id && s_id < K) << "Segment id out of range: " << s_id
-                                   << ", range 0 to " << K;
+      CAFFE_ENFORCE(
+          0 <= s_id && s_id < K,
+          "Segment id out of range: ",
+          s_id,
+          ", range 0 to ",
+          K);
       TIndex idx;
       if (SparseFused) { // static if
-        CHECK(0 <= idxs[i] && idxs[i] < M) << "Index out of bounds: " << idxs[i]
-                                           << ", range 0 to " << M;
+        CAFFE_ENFORCE(
+            0 <= idxs[i] && idxs[i] < M,
+            "Index out of bounds: ",
+            idxs[i],
+            ", range 0 to ",
+            M);
         idx = idxs[i];
       } else {
         idx = i;
@@ -897,10 +919,9 @@
   USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
 
   bool RunOnDevice() override {
-    auto& segment_grads = Input(SEGMENT_GRADS);
     // If more complicated fixed size logic becomes necessary, it can be moved
     // to the reducer class
-    TIndex grad_block_size = segment_grads.size() / segment_grads.dim(0);
+    TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
     return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
         this, grad_block_size);
   }
@@ -931,9 +952,9 @@
     ctx.appendGradShape(&shape);
     data_grads->Resize(shape);
 
-    TIndex d_block_size = data_grads->size() / data_grads->dim(0);
+    TIndex d_block_size = data_grads->size_from_dim(1);
     const SIndex K = segment_grads.dim(0);
-    TIndex s_block_size = segment_grads.size() / K;
+    TIndex s_block_size = segment_grads.size_from_dim(1);
     T* out = data_grads->template mutable_data<T>();
 
     reducers_.clear();
@@ -944,8 +965,12 @@
 
     for (TIndex i = 0; i < N; ++i) {
       auto s_id = s_ids[i];
-      CHECK(0 <= s_id && s_id < K) << "Segment id out of range: " << s_id
-                                   << ", range 0 to " << K;
+      CAFFE_ENFORCE(
+          0 <= s_id && s_id < K,
+          "Segment id out of range: ",
+          s_id,
+          ", range 0 to ",
+          K);
       reducers_[s_id].template fillGrad<FixedSize>(
           ctx, out + d_block_size * i, i, &context_);
     }
@@ -1086,6 +1111,351 @@
       true /*SparseFused*/>;
 };
 
+/**
+ * @brief Segment reduction op with optional fused embedding lookup
+ *
+ * Base implementation for LengthsXXX and SparseLengthsXXX depending
+ * on SparseFused static argument.
+ *
+ * Inputs:
+ *   0: DATA - input embedding to do lookups in
+ *   1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
+ *                       reducer; should have the same first dimension as
+ *                       the data being reduced (e.g. scalars in WeightedSum)
+ *   # if SparseFused == true:
+ *   P+1: INDICES - 1-D vector with indices to look up in DATA. Its length
+ *                  must equal sum(LENGTHS)
+ *   # P+1 if SparseFused == false:
+ *   P+1 or P+2: LENGTHS - vector of segment lengths over the data being reduced
+ *
+ * Output:
+ *   Tensor with first dimension of K, where K = len(LENGTHS). Rest
+ *   of dimensions are decided by reducer but usually are the same size as extra
+ *   dimensions of DATA
+ */
+// TODO(dzhulgakov): for now it's implemented with incremental reducers because
+// of fused sparse support. But using "lengths" representation actually implies
+// continuous segments and thus range reducers can be used for non-sparse
+// version.
+template <
+    typename TData,
+    typename TLengths,
+    class Context,
+    class Reducer,
+    bool SparseFused = true>
+class AbstractLengthsOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
+
+  bool RunOnDevice() override {
+    // If more complicated fixed size logic becomes necessary, it can be moved
+    // to the reducer class
+    TIndex dataBlockSize = Input(0).size_from_dim(1);
+    return DispatchHelper<typename Reducer::FixedDispatch>::call(
+        this, dataBlockSize);
+  }
+
+  template <int FixedSize>
+  bool DoRunWithValue() {
+    auto& dataInput = Input(0);
+    auto& lengthsInput = Input(LENGTHS);
+    auto* output = Output(0);
+
+    CHECK_EQ(1, lengthsInput.ndim()) << "LENGTHS must be a vector";
+    const TIndex dataSize = dataInput.dim(0);
+    // Either the first dim of the data or how many rows we pull in via indices
+    TIndex dataToReduceSize;
+    const TIndex outputSize = lengthsInput.dim(0);
+
+    const TIndex* indicies;
+    if (SparseFused) { // static if
+      auto& indicesInput = Input(INDICES);
+      CHECK_EQ(1, indicesInput.ndim()) << "INDICES must be a vector";
+      indicies = indicesInput.template data<TIndex>();
+      dataToReduceSize = indicesInput.dim(0);
+    } else {
+      dataToReduceSize = dataSize;
+    }
+
+    typename Reducer::Meta ctx;
+    ctx.observeInput(0, dataInput, 1);
+    for (int i = 1; i < Reducer::kInputCount; ++i) {
+      auto& aux_in = Input(i);
+      CAFFE_ENFORCE(
+          dataToReduceSize == aux_in.dim(0),
+          "Input ",
+          i,
+          " must have the same first dim as the data being reduced");
+      ctx.observeInput(i, aux_in, 1);
+    }
+
+    const TLengths* lengths = lengthsInput.template data<TLengths>();
+    const TData* data = dataInput.template data<TData>();
+
+    vector<TIndex> shape{outputSize};
+    ctx.appendOutputShape(&shape);
+    output->Resize(shape);
+
+    TIndex in_block_size = dataInput.size_from_dim(1);
+    TIndex out_block_size = output->size_from_dim(1);
+    TData* out = output->template mutable_data<TData>();
+
+    TIndex dataIndex = 0;
+    for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
+      Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
+      for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
+           ++dataIndex) {
+        TIndex idx;
+        if (SparseFused) { // static if
+          idx = indicies[dataIndex];
+          CAFFE_ENFORCE(
+              0 <= idx && idx < dataSize,
+              "Index ",
+              dataIndex,
+              " is out of bounds: ",
+              idx,
+              ", range 0 to ",
+              dataSize);
+        } else {
+          idx = dataIndex;
+          CAFFE_ENFORCE(
+              idx < dataSize,
+              "Range ",
+              rangeIndex,
+              " of length ",
+              lengths[rangeIndex],
+              " is out of bound ",
+              dataSize);
+        }
+        reducer.template process<FixedSize>(
+            ctx, data + in_block_size * idx, dataIndex, &context_);
+      }
+    }
+    CAFFE_ENFORCE(
+        dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
+    return true;
+  }
+
+  enum {
+    INDICES = Reducer::kInputCount,
+    LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
+  };
+  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
+  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
+};
+
+// Gradient actually doesn't depend on whether sparse lookup is fused or not
+template <typename T, typename TLengths, class Context, class ReducerGradient>
+class AbstractLengthsGradientOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
+
+  bool RunOnDevice() override {
+    // If more complicated fixed size logic becomes necessary, it can be moved
+    // to the reducer class
+    TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
+    return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
+        this, gradBlockSize);
+  }
+
+  template <int FixedSize>
+  bool DoRunWithValue() {
+    auto& segmentGradsInput = Input(SEGMENT_GRADS);
+    auto& lengthsInput = Input(LENGTHS);
+    auto* dataGradsOutput = Output(0);
+
+    CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
+    TIndex reducedDataSize = 0;
+    TIndex numSegments = lengthsInput.dim(0);
+    CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
+    CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
+    const TLengths* lengths = lengthsInput.template data<TLengths>();
+    for (TIndex i = 0; i < numSegments; ++i) {
+      reducedDataSize += lengths[i];
+    }
+
+    typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
+    for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
+      auto& aux_in = Input(i);
+      CAFFE_ENFORCE_EQ(
+          reducedDataSize,
+          aux_in.dim(0),
+          "Input ",
+          i,
+          " must have the same first dim as the data being reduced");
+      ctx.observeOriginalInput(ReducerGradient::originalInputs()[i], aux_in, 1);
+    }
+
+    const T* segmentGrads = segmentGradsInput.template data<T>();
+
+    vector<TIndex> shape;
+    shape.push_back(reducedDataSize);
+    ctx.appendGradShape(&shape);
+    dataGradsOutput->Resize(shape);
+
+    TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
+    TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
+    T* dataGrads = dataGradsOutput->template mutable_data<T>();
+
+    TIndex dataIndex = 0;
+    for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+      ReducerGradient reducer(
+          ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
+      for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
+           ++dataIndex) {
+        reducer.template fillGrad<FixedSize>(
+            ctx,
+            dataGrads + dataGradsBlockSize * dataIndex,
+            dataIndex,
+            &context_);
+      }
+    }
+    CAFFE_ENFORCE(
+        dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
+    return true;
+  }
+
+  // Input layout:
+  //   orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
+  // orig_argXs represent original op's inputs and will be passed to the reducer
+  // directly
+  static constexpr int kNumInputs =
+      ReducerGradient::originalInputs().size() + 2;
+  enum _InputTags {
+    SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
+    LENGTHS
+  };
+};
+
+// base implementation of sparse/non-sparse gradient computation
+template <
+    typename ForwardOp,
+    typename ReducerDef,
+    typename ReducerGradient,
+    bool SparseFused>
+struct LengthsOpGetGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    vector<string> grad_ins;
+    for (const int i : ReducerGradient::originalInputs()) {
+      grad_ins.push_back(I(i));
+    }
+    grad_ins.push_back(GO(0));
+    grad_ins.push_back(I(ForwardOp::LENGTHS));
+    vector<OperatorDef> r{CreateOperatorDef(
+        string("Lengths") + ReducerDef::name + "Gradient",
+        "",
+        grad_ins,
+        // no gradient on segment_ids or auxiliary inputs for now
+        vector<string>{SparseFused ? GI_V(0) : GI(0)})};
+    if (SparseFused) {
+      SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
+    }
+    return r;
+  }
+};
+
+template <typename T, typename SIndex, typename Context, typename ReducerDef>
+struct AbstractLengthsDef {
+  using OpDef = ReducerDef;
+  static constexpr const char* basename = "Lengths";
+  static constexpr const char* doc = R"DOC(
+Applies '{op}' to each segment of the input tensor. Segments are defined
+by their LENGTHS.
+
+LENGTHS is a vector that maps each of the first dimension slices of the
+DATA to a particular group (segment). Values belonging to the same segment are
+aggregated together.
+
+For example, LENGTHS = [2, 1] stands for segments DATA[0..1] and DATA[2].
+
+The first dimension of the output is equal to the number of input segments,
+i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
+
+{op_doc}
+  )DOC";
+  static void PopulateSchema(OpSchema& schema) {
+    schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
+    schema.Input(
+        Reducer::kInputCount,
+        "LENGTHS",
+        "Vector whose elements sum to the first dimension of DATA");
+    schema.Output(
+        0,
+        "OUTPUT",
+        "Aggregated output tensor. Has the first dimension of len(LENGTHS) ");
+    ReducerDef::PopulateSchema(schema);
+  }
+  using Reducer = typename ReducerDef::template Reducer<T, Context>;
+  using ReducerGradient =
+      typename ReducerDef::template ReducerGradient<T, Context>;
+  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
+  using BackwardOp =
+      AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
+  using GetGradient = LengthsOpGetGradient<
+      ForwardOp,
+      ReducerDef,
+      ReducerGradient,
+      false /*SparseFused*/>;
+};
+
+template <typename T, typename SIndex, typename Context, typename ReducerDef>
+struct AbstractSparseLengthsDef {
+  using OpDef = ReducerDef;
+  static constexpr const char* basename = "SparseLengths";
+  static constexpr const char* doc = R"DOC(
+Pulls in slices of the input tensor, groups them into segments and applies
+'{op}' to each segment. Segments are defined by their LENGTHS.
+
+This op is basically Gather and Lengths{op} fused together.
+
+INDICES should contain integers in range 0..N-1 where N is the first dimension
+of DATA. INDICES represent which slices of DATA need to be pulled in.
+
+LENGTHS is a vector that defines how many of the gathered slices belong to each
+segment. Values belonging to the same segment are aggregated together.
+sum(LENGTHS) has to match the INDICES size.
+
+The first dimension of the output is equal to the number of input segments,
+i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
+
+{op_doc}
+  )DOC";
+  static void PopulateSchema(OpSchema& schema) {
+    schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
+    schema.Input(
+        Reducer::kInputCount,
+        "INDICES",
+        "Integer vector containing indices of the first dimension of DATA for "
+        "the slices that are being aggregated");
+    schema.Input(
+        Reducer::kInputCount + 1,
+        "LENGTHS",
+        "Non-negative vector with sum of elements equal to INDICES length");
+    schema.Output(
+        0,
+        "OUTPUT",
+        "Aggregated output tensor. Has the first dimension of K "
+        "(the number of segments).");
+    ReducerDef::PopulateSchema(schema);
+  }
+  using Reducer = typename ReducerDef::template Reducer<T, Context>;
+  using ReducerGradient =
+      typename ReducerDef::template ReducerGradient<T, Context>;
+  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
+  // TODO(dzhulgakov): we're registering the same class twice here,
+  // consider avoiding op duplication here
+  using BackwardOp =
+      AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
+  using GetGradient = LengthsOpGetGradient<
+      ForwardOp,
+      ReducerDef,
+      ReducerGradient,
+      true /*SparseFused*/>;
+};
+
 namespace {
 
 template <typename Def>
@@ -1135,17 +1505,21 @@
 REGISTER_SEGMENT_DEF(
     AbstractSortedSegmentRangeDef<float, int, CPUContext, MaxRangeReducerDef>);
 
-#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def)                          \
-  REGISTER_SEGMENT_DEF(                                                     \
-      AbstractReduceFrontDef<float, CPUContext, reducer_def>);              \
-  REGISTER_SEGMENT_DEF(                                                     \
-      AbstractSortedSegmentDef<float, int, CPUContext, reducer_def>);       \
-  REGISTER_SEGMENT_DEF(                                                     \
-      AbstractSparseSortedSegmentDef<float, int, CPUContext, reducer_def>); \
-  REGISTER_SEGMENT_DEF(                                                     \
-      AbstractUnsortedSegmentDef<float, int, CPUContext, reducer_def>);     \
-  REGISTER_SEGMENT_DEF(                                                     \
-      AbstractSparseUnsortedSegmentDef<float, int, CPUContext, reducer_def>)
+#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def)                           \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractReduceFrontDef<float, CPUContext, reducer_def>);               \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractSortedSegmentDef<float, int, CPUContext, reducer_def>);        \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractSparseSortedSegmentDef<float, int, CPUContext, reducer_def>);  \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractUnsortedSegmentDef<float, int, CPUContext, reducer_def>);      \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractSparseUnsortedSegmentDef<float, int, CPUContext, reducer_def>) \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractLengthsDef<float, int, CPUContext, reducer_def>)               \
+  REGISTER_SEGMENT_DEF(                                                      \
+      AbstractSparseLengthsDef<float, int, CPUContext, reducer_def>)
 
 REGISTER_REDUCER_WITH_ALL_OPS(SumReducerDef);
 REGISTER_REDUCER_WITH_ALL_OPS(WeightedSumReducerDef);
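For readers new to the Lengths/SparseLengths family registered above, a minimal sketch of the LengthsSum semantics (illustrative only; the SparseLengths variant is the same loop except that the row to read comes from INDICES rather than the running index):

    // DATA has shape [N, block]; LENGTHS sums to N; OUTPUT has shape
    // [len(LENGTHS), block], each output row being the sum of its segment.
    #include <cstdio>
    #include <vector>

    int main() {
      const int block = 2;
      std::vector<float> data = {1, 2, 3, 4, 5, 6};  // 3 rows of size 2
      std::vector<int> lengths = {2, 1};             // segments: rows 0..1, row 2

      std::vector<float> out(lengths.size() * block, 0.f);
      int row = 0;
      for (size_t seg = 0; seg < lengths.size(); ++seg) {
        for (int j = 0; j < lengths[seg]; ++j, ++row) {
          for (int b = 0; b < block; ++b) {
            out[seg * block + b] += data[row * block + b];
          }
        }
      }
      // out == {4, 6, 5, 6}: segment 0 = row0 + row1, segment 1 = row2
      std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
      return 0;
    }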
diff --git a/caffe2/operators/sequence_ops.cc b/caffe2/operators/sequence_ops.cc
index 9d6a394..d7b10e1 100644
--- a/caffe2/operators/sequence_ops.cc
+++ b/caffe2/operators/sequence_ops.cc
@@ -34,17 +34,17 @@
   bool DoRunWithType() {
     const auto& in = Input(0);
     CHECK_GE(in.ndim(), 1);
-    const auto outer_size = in.dims()[0];
+    const int32_t outer_size = in.dims()[0];
     const auto block_size = std::accumulate(
         in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
     const auto pad_width = startPaddingWidth_ + endPaddingWidth_;
 
     // if no lengths is provided, assume it is a single full-span entry
-    const int64_t* lengths_ptr = &outer_size;
+    const int32_t* lengths_ptr = &outer_size;
     int64_t lengths_size = 1;
     if (InputSize() > 1) {
       const auto& lengths = Input(1);
-      lengths_ptr = lengths.data<int64_t>();
+      lengths_ptr = lengths.data<int32_t>();
       lengths_size = lengths.size();
     }
 
@@ -124,17 +124,17 @@
   bool DoRunWithType() {
     const auto& in = Input(0);
     CHECK_GE(in.ndim(), 1);
-    const auto outer_size = in.dims()[0];
+    const int32_t outer_size = in.dims()[0];
     const auto block_size = std::accumulate(
         in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
     const auto pad_width = startPaddingWidth_ + endPaddingWidth_;
 
     // if no lengths is provided, assume it is a single full-span entry
-    const int64_t* lengths_ptr = &outer_size;
+    const int32_t* lengths_ptr = &outer_size;
     int64_t lengths_size = 1;
     if (InputSize() > 1) {
       const auto& lengths = Input(1);
-      lengths_ptr = lengths.data<int64_t>();
+      lengths_ptr = lengths.data<int32_t>();
       lengths_size = lengths.size();
     }
 
@@ -167,8 +167,8 @@
     std::transform(
         lengths_ptr,
         lengths_ptr + lengths_size,
-        lengths_out->mutable_data<int64_t>(),
-        [pad_width](int64_t x) { return x - pad_width; });
+        lengths_out->mutable_data<int32_t>(),
+        [pad_width](int32_t x) { return x - pad_width; });
     return true;
   }
 
@@ -207,16 +207,16 @@
   bool DoRunWithType() {
     const auto& in = Input(0);
     CHECK_GE(in.ndim(), 1);
-    const auto outer_size = in.dims()[0];
+    const int32_t outer_size = in.dims()[0];
     const auto block_size = std::accumulate(
         in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
 
     // if no lengths is provided, assume it is a single full-span entry
-    const int64_t* lengths_ptr = &outer_size;
+    const int32_t* lengths_ptr = &outer_size;
     int64_t lengths_size = 1;
     if (InputSize() > 1) {
       const auto& lengths = Input(1);
-      lengths_ptr = lengths.data<int64_t>();
+      lengths_ptr = lengths.data<int32_t>();
       lengths_size = lengths.size();
     }
 
@@ -288,8 +288,8 @@
     std::transform(
         lengths_ptr,
         lengths_ptr + lengths_size,
-        lengths_out->mutable_data<int64_t>(),
-        [pad_width](int64_t x) { return x + pad_width; });
+        lengths_out->mutable_data<int32_t>(),
+        [pad_width](int32_t x) { return x + pad_width; });
     return true;
   }
 
diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc
index 8e641b3..4ab1d2c 100644
--- a/caffe2/operators/sparse_to_dense_mask_op.cc
+++ b/caffe2/operators/sparse_to_dense_mask_op.cc
@@ -36,23 +36,6 @@
 
   template <typename TInd>
   bool DoRunWithType() {
-    if (InputSize() < 4) {
-      return DoRunWithTypeAndLength<TInd, int32_t>();
-    } else {
-      const TypeMeta& meta = Input(LENGTHS).meta();
-      if (meta.Match<int32_t>()) {
-        return DoRunWithTypeAndLength<TInd, int32_t>();
-      } else if (meta.Match<int64_t>()) {
-        return DoRunWithTypeAndLength<TInd, int64_t>();
-      } else {
-        CAFFE_THROW("Unsupported type of tensor: ", meta.name());
-        return false;
-      }
-    }
-  }
-
-  template <typename TInd, typename TLen>
-  bool DoRunWithTypeAndLength() {
     auto& sparse_indices = Input(INDICES);
     CAFFE_ENFORCE(sparse_indices.ndim() == 1);
     auto& sparse_values = Input(VALUES);
@@ -73,14 +56,14 @@
 
     int cols = featuresCount_;
     int rows = 0;
-    TLen default_length = sparse_indices.dim32(0);
-    const TLen* lengths_vec = nullptr;
+    int32_t default_length = sparse_indices.dim32(0);
+    const int32_t* lengths_vec = nullptr;
     auto* output = Output(0);
     vector<TIndex> shape;
     if (InputSize() == 4) {
       auto& lengths = Input(LENGTHS);
       CAFFE_ENFORCE(lengths.ndim() == 1);
-      lengths_vec = lengths.data<TLen>();
+      lengths_vec = lengths.data<int32_t>();
       rows = lengths.dim32(0);
     }
     if (rows == 0) {
@@ -107,7 +90,7 @@
           output_data + i * block_nbytes);
     }
 
-    TLen offset = 0;
+    int32_t offset = 0;
     for (int r = 0; r < rows; r++) {
       for (int c = 0; c < lengths_vec[r]; c++) {
         int idx = getFeatureIdx(sparse_indices_vec[offset + c]);
@@ -163,8 +146,8 @@
 the value of `default_value`. After running this op:
 
 ```
-output[indices[i], :] = values[i]
-output[j, :] = default_value # for j not in indices
+output[j, :] = values[i] # where mask[j] == indices[i]
+output[j, ...] = default_value # when mask[j] doesn't appear in indices
 ```
 
 If `lengths` is provided and not empty, and extra "batch" dimension is prepended
diff --git a/caffe2/operators/spatial_batch_norm_op.cc b/caffe2/operators/spatial_batch_norm_op.cc
index 47b81ca..2378b90 100644
--- a/caffe2/operators/spatial_batch_norm_op.cc
+++ b/caffe2/operators/spatial_batch_norm_op.cc
@@ -139,7 +139,89 @@
 
 template <>
 bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
-  CAFFE_THROW("Spatial BN gradient on the CPU is not implemented yet.");
+  const auto& X = Input(INPUT);
+  const auto& dY = Input(OUTPUT_GRAD);
+  const auto& scale = Input(SCALE);
+
+  DCHECK_EQ(X.ndim(), 4);
+  const int N = X.dim32(0);
+  const int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
+  const int H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
+  const int W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
+  DCHECK_EQ(scale.ndim(), 1);
+  DCHECK_EQ(scale.dim32(0), C);
+
+  ConstEigenVectorArrayMap<float> scale_arr(scale.data<float>(), C);
+  ConstEigenVectorArrayMap<float> mean_arr(Input(SAVED_MEAN).data<float>(), C);
+  ConstEigenVectorArrayMap<float> inv_var_arr(
+      Input(SAVED_INV_VAR).data<float>(), C);
+
+  auto* dX = Output(INPUT_GRAD);
+  auto* dScale = Output(SCALE_GRAD);
+  auto* dBias = Output(BIAS_GRAD);
+  dX->ResizeLike(X);
+  dScale->ResizeLike(scale);
+  dBias->ResizeLike(scale);
+
+  // dBias = np.sum(dY, axis=0)
+  // dScale = np.sum((X - mean) * inv_var * dY, axis=0)
+  // dX = (1. / N) * scale * inv_var * (N * dY - np.sum(dY, axis=0) - (X - mean)
+  //   * inv_var * inv_var * np.sum(dY * (X - mean), axis=0))
+
+  EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
+  EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
+
+  dBias_arr.setZero();
+  dScale_arr.setZero();
+
+  const auto scaleInvVarNHW = scale_arr * inv_var_arr / (N * H * W);
+
+  switch (order_) {
+    case StorageOrder::NCHW: {
+      ConstEigenArrayMap<float> X_arr(X.data<float>(), H * W, N * C);
+      ConstEigenArrayMap<float> dY_arr(dY.data<float>(), H * W, N * C);
+      EigenArrayMap<float> dX_arr(dX->mutable_data<float>(), H * W, N * C);
+      dX_arr.setZero();
+
+      for (int nc = 0; nc < N * C; ++nc) {
+        int c = nc % C;
+        dBias_arr(c) += dY_arr.col(nc).sum();
+        dScale_arr(c) +=
+            ((X_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * dY_arr.col(nc))
+                .sum();
+      }
+      for (int nc = 0; nc < N * C; ++nc) {
+        int c = nc % C;
+        dX_arr.col(nc) += scaleInvVarNHW(c) *
+            (dY_arr.col(nc) * N * H * W - dBias_arr(c) -
+             (X_arr.col(nc) - mean_arr[c]) * dScale_arr(c) * inv_var_arr(c));
+      }
+      break;
+    }
+    case StorageOrder::NHWC: {
+      ConstEigenArrayMap<float> X_arr(X.data<float>(), C, N * H * W);
+      ConstEigenArrayMap<float> dY_arr(dY.data<float>(), C, N * H * W);
+      EigenArrayMap<float> dX_arr(dX->mutable_data<float>(), C, N * H * W);
+      dX_arr.setZero();
+
+      const auto dYRowSum = dY_arr.rowwise().sum();
+      const auto XMinusMean = X_arr.colwise() - mean_arr;
+      const auto dYMulXMinusMeanRowSum = (dY_arr * XMinusMean).rowwise().sum();
+      const auto invVarSqr = inv_var_arr * inv_var_arr;
+      for (int nhw = 0; nhw < N * H * W; ++nhw) {
+        dBias_arr += dY_arr.col(nhw);
+        dScale_arr +=
+            (X_arr.col(nhw) - mean_arr) * inv_var_arr * dY_arr.col(nhw);
+        dX_arr.col(nhw) += scaleInvVarNHW *
+            (dY_arr.col(nhw) * N * H * W - dYRowSum -
+             XMinusMean.col(nhw) * invVarSqr * dYMulXMinusMeanRowSum);
+      }
+      break;
+    }
+    default:
+      CAFFE_THROW("Unknown storage order: ", order_);
+  }
+  return true;
 }
 
 REGISTER_CPU_OPERATOR(SpatialBN, SpatialBNOp<CPUContext>);
@@ -223,7 +305,7 @@
     bool is_test = false;
     if (HasArgument(def_, "is_test")) {
       const auto& arg = GetArgument(def_, "is_test");
-      CHECK(arg.has_i());
+      CAFFE_ENFORCE(arg.has_i());
       is_test = arg.i();
     }
     vector<string> grad_outputs{GI(0), GI(1), GI(2)};
@@ -235,8 +317,7 @@
       //     X, scale, dY, estimated_mean, estimated_variance
       CHECK_EQ(def_.input_size(), 5);
       CHECK_EQ(def_.output_size(), 1);
-      grad_inputs = vector<string>{
-          I(0), I(1), GO(0), I(3), I(4)};
+      grad_inputs = vector<string>{I(0), I(1), GO(0), I(3), I(4)};
     } else {
       CHECK_EQ(def_.input_size(), 5);
       CHECK_EQ(def_.output_size(), 5);
@@ -247,4 +328,4 @@
   }
 };
 REGISTER_GRADIENT(SpatialBN, GetSpatialBNGradient);
-}  // namespace caffe2
+} // namespace caffe2
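As a reading aid for the new CPU SpatialBN backward above, these are the per-channel formulas it implements, with m = N*H*W, inv_std the saved inverse standard deviation (the code's inv_var), and x_hat = (X - mean) * inv_std:

    dBias  = sum(dY)                                  # sum over (n, h, w)
    dScale = sum(dY * x_hat)
    dX     = scale * inv_std / m * (m * dY - dBias - x_hat * dScale)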
diff --git a/caffe2/operators/spatial_batch_norm_op.h b/caffe2/operators/spatial_batch_norm_op.h
index 824a8c3..e7435e2 100644
--- a/caffe2/operators/spatial_batch_norm_op.h
+++ b/caffe2/operators/spatial_batch_norm_op.h
@@ -19,7 +19,8 @@
         order_(StringToStorageOrder(
             OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
     // TODO(jiayq): update the input and output size checks.
-    CHECK((is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
+    CAFFE_ENFORCE(
+        (is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
     CHECK_GT(epsilon_, 0);
     CHECK_GE(momentum_, 0);
     CHECK_LE(momentum_, 1);
@@ -49,8 +50,8 @@
         epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
         order_(StringToStorageOrder(
             OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    CHECK(InputSize() == 5);
-    CHECK_EQ(OutputSize(), 3);
+    CAFFE_ENFORCE(InputSize() == 5);
+    CAFFE_ENFORCE(OutputSize() == 3);
   }
   ~SpatialBNGradientOp() {}
 
diff --git a/caffe2/operators/square_root_divide_op.cc b/caffe2/operators/square_root_divide_op.cc
new file mode 100644
index 0000000..375937b
--- /dev/null
+++ b/caffe2/operators/square_root_divide_op.cc
@@ -0,0 +1,45 @@
+#include "caffe2/operators/square_root_divide_op.h"
+
+namespace caffe2 {
+
+REGISTER_CPU_OPERATOR(
+    SquareRootDivide,
+    SquareRootDivideOp<int32_t, CPUContext>);
+OPERATOR_SCHEMA(SquareRootDivide)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .AllowInplace({{0, 0}})
+    .SetDoc(R"DOC(
+Given a DATA tensor with first dimension N and a SCALE vector of the same size N,
+produces an output tensor with the same dimensions as DATA, consisting of DATA
+slices: the i-th slice is divided by sqrt(SCALE[i]) elementwise. If SCALE[i] == 0
+the output slice is identical to the input one (no scaling).
+
+Example:
+
+  Data = [
+    [2.0, 4.0],
+    [9.0, 12.0]
+  ]
+
+  SCALE = [4, 9]
+
+  OUTPUT = [
+    [1.0, 2.0],
+    [3.0, 4.0]
+  ]
+
+)DOC");
+
+class GetSquareRootDivideGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    return SingleGradientDef(
+        "SquareRootDivide",
+        "",
+        vector<string>{GO(0), I(1)},
+        vector<string>{GI(0)});
+  }
+};
+REGISTER_GRADIENT(SquareRootDivide, GetSquareRootDivideGradient);
+} // namespace caffe2
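A short note on the gradient maker above: SquareRootDivide is linear in DATA, so its gradient is the same op applied to the output gradient, which is why GetGradientDefs re-uses SquareRootDivide on (GO(0), I(1)). A quick sketch of the reasoning:

    y_i = x_i / sqrt(s)                     # s = SCALE[row]; s > 0, s == 0 is identity
    dL/dx_i = (dL/dy_i) * (dy_i/dx_i) = (dL/dy_i) / sqrt(s)
    => dDATA = SquareRootDivide(dOUTPUT, SCALE)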
diff --git a/caffe2/operators/square_root_divide_op.h b/caffe2/operators/square_root_divide_op.h
new file mode 100644
index 0000000..8d5d908
--- /dev/null
+++ b/caffe2/operators/square_root_divide_op.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+template <typename TScale, class Context>
+class SquareRootDivideOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_DISPATCH_HELPER;
+
+  SquareRootDivideOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<float, double>>::call(this, Input(DATA));
+  }
+
+ private:
+  template <typename TData>
+  bool DoRunWithType() {
+    auto& data = Input(DATA);
+    auto& scale = Input(SCALE);
+    auto* Y = Output(0);
+    Y->ResizeLike(data);
+    size_t batchSize = data.dim(0);
+    size_t exampleSize = data.size_from_dim(1);
+    CAFFE_ENFORCE(batchSize == scale.dim(0), batchSize, " != ", scale.dim(0));
+    auto* scalePtr = scale.template data<TScale>();
+    auto* dataPtr = data.template data<TData>();
+    auto* yPtr = Y->template mutable_data<TData>();
+    for (int i = 0; i < batchSize; ++i) {
+      auto scale = scalePtr[i];
+      CAFFE_ENFORCE(scale >= 0, scale, " < 0");
+      auto multiplier = scale == 0 ? 1.0 : 1 / std::sqrt(scale);
+      math::Scale<TData, Context>(
+          exampleSize,
+          multiplier,
+          dataPtr + i * exampleSize,
+          yPtr + i * exampleSize,
+          &context_);
+    }
+    return true;
+  }
+
+  INPUT_TAGS(DATA, SCALE);
+};
+
+} // namespace caffe2
diff --git a/caffe2/operators/summarize_op.h b/caffe2/operators/summarize_op.h
index fe4d2a7..86d5e9c 100644
--- a/caffe2/operators/summarize_op.h
+++ b/caffe2/operators/summarize_op.h
@@ -24,9 +24,12 @@
       log_file_.reset(new std::ofstream(
           target_folder + "/" + def.input(0) + kSummaryzeOpExtension,
           std::ofstream::out | std::ofstream::trunc));
-      CHECK(log_file_->good())
-          << "Failed to open summarize file for tensor " << def.input(0)
-          << ". rdstate() = " << log_file_->rdstate();
+      CAFFE_ENFORCE(
+          log_file_->good(),
+          "Failed to open summarize file for tensor ",
+          def.input(0),
+          ". rdstate() = ",
+          log_file_->rdstate());
     }
   }
   ~SummarizeOp() { if (to_file_) log_file_->close(); }
diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h
index 7b7ca3f..65e7fae 100644
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@@ -50,13 +50,13 @@
     // deserialize everything into the target prefetched blob.
     reader.Read(&key_, &value_);
     TensorProtos protos;
-    CHECK(protos.ParseFromString(value_));
-    CHECK_EQ(protos.protos_size(), OutputSize());
+    CAFFE_ENFORCE(protos.ParseFromString(value_));
+    CAFFE_ENFORCE(protos.protos_size() == OutputSize());
     for (int i = 0; i < protos.protos_size(); ++i) {
       if (protos.protos(i).has_device_detail()) {
         protos.mutable_protos(i)->clear_device_detail();
       }
-      CHECK(deserializer.Deserialize(
+      CAFFE_ENFORCE(deserializer.Deserialize(
           protos.protos(i),
           prefetched_blobs_[i].template GetMutable<TensorCPU>()));
     }
@@ -65,8 +65,8 @@
     for (int item_id = 0; item_id < batch_size_; ++item_id) {
       reader.Read(&key_, &value_);
       TensorProtos protos;
-      CHECK(protos.ParseFromString(value_));
-      CHECK_EQ(protos.protos_size(), OutputSize());
+      CAFFE_ENFORCE(protos.ParseFromString(value_));
+      CAFFE_ENFORCE(protos.protos_size() == OutputSize());
       if (!shape_inferred_) {
         // First, set the shape of all the blobs.
         for (int i = 0; i < protos.protos_size(); ++i) {
@@ -82,7 +82,7 @@
         if (protos.protos(i).has_device_detail()) {
           protos.mutable_protos(i)->clear_device_detail();
         }
-        CHECK(deserializer.Deserialize(protos.protos(i), &src));
+        CAFFE_ENFORCE(deserializer.Deserialize(protos.protos(i), &src));
         DCHECK_EQ(src.size() * batch_size_, dst->size());
         this->context_.template CopyItems<CPUContext, CPUContext>(
             src.meta(),
diff --git a/caffe2/operators/text_file_reader.cc b/caffe2/operators/text_file_reader.cc
index e74ab06..6c3d038 100644
--- a/caffe2/operators/text_file_reader.cc
+++ b/caffe2/operators/text_file_reader.cc
@@ -1,6 +1,7 @@
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
+#include "caffe2/operators/text_file_reader_utils.h"
 #include "caffe2/utils/string_utils.h"
 
 namespace caffe2 {
@@ -154,6 +155,8 @@
   TIndex batchSize_;
 };
 
+CAFFE_KNOWN_TYPE(std::unique_ptr<TextFileReaderInstance>);
+
 REGISTER_CPU_OPERATOR(CreateTextFileReader, CreateTextFileReaderOp);
 REGISTER_CPU_OPERATOR(TextFileReaderRead, TextFileReaderReadOp);
 
diff --git a/caffe2/operators/text_file_reader_utils.cc b/caffe2/operators/text_file_reader_utils.cc
new file mode 100644
index 0000000..0264f0e
--- /dev/null
+++ b/caffe2/operators/text_file_reader_utils.cc
@@ -0,0 +1,118 @@
+#include "caffe2/operators/text_file_reader_utils.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstring>
+#include <sstream>
+
+namespace caffe2 {
+
+Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
+    : escape_(escape) {
+  reset();
+  std::memset(delimTable_, 0, sizeof(delimTable_));
+  for (int i = 0; i < delims.size(); ++i) {
+    delimTable_[(unsigned char)delims.at(i)] = i + 1;
+  }
+}
+
+void Tokenizer::reset() {
+  toBeSkipped_ = 0;
+  startDelimId_ = 0;
+  leftover_.clear();
+}
+
+void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
+  tokenized.modifiedStrings_.clear();
+  tokenized.tokens_.clear();
+
+  char* currentStart = start;
+  std::string* copied = nullptr;
+  if (!leftover_.empty()) {
+    tokenized.modifiedStrings_.emplace_back(new std::string());
+    copied = tokenized.modifiedStrings_.back().get();
+    *copied = std::move(leftover_);
+  }
+
+  char* ch;
+  for (ch = start + toBeSkipped_; ch < end; ++ch) {
+    if (*ch == escape_) {
+      if (!copied) {
+        tokenized.modifiedStrings_.emplace_back(new std::string());
+        copied = tokenized.modifiedStrings_.back().get();
+      }
+      copied->append(currentStart, ch);
+      currentStart = ch + 1;
+      // skip next character, since it's escaped
+      ++ch;
+      continue;
+    }
+    int newDelimId = delimTable_[(unsigned char)*ch];
+    if (newDelimId > 0) {
+      // found delimiter
+      tokenized.tokens_.emplace_back();
+      auto& token = tokenized.tokens_.back();
+      token.startDelimId = startDelimId_;
+      if (copied) {
+        copied->append(currentStart, ch);
+        const char* c_str = copied->data();
+        token.start = c_str;
+        token.end = c_str + copied->size();
+      } else {
+        token.start = currentStart;
+        token.end = ch;
+      }
+      currentStart = ch + 1;
+      copied = nullptr;
+      startDelimId_ = newDelimId - 1;
+    }
+  }
+  tokenized.lastDelim_ = startDelimId_;
+
+  toBeSkipped_ = ch - end;
+  if (copied) {
+    copied->append(currentStart, end);
+    leftover_ = std::move(*copied);
+  } else {
+    leftover_.assign(currentStart, end);
+  }
+}
+
+FileReader::FileReader(const std::string& path, size_t bufferSize)
+    : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
+  fd_ = open(path.c_str(), O_RDONLY, 0777);
+  if (fd_ < 0) {
+    throw std::runtime_error(
+        "Error opening file for reading: " + std::string(std::strerror(errno)));
+  }
+}
+
+void FileReader::reset() {
+  if (lseek(fd_, 0, SEEK_SET) == -1) {
+    throw std::runtime_error(
+        "Error resetting file cursor: " + std::string(std::strerror(errno)));
+  }
+}
+
+FileReader::~FileReader() {
+  if (fd_ >= 0) {
+    close(fd_);
+  }
+}
+
+void FileReader::operator()(CharRange& range) {
+  char* buffer = buffer_.get();
+  auto numRead = read(fd_, buffer, bufferSize_);
+  if (numRead == -1) {
+    throw std::runtime_error(
+        "Error reading file: " + std::string(std::strerror(errno)));
+  }
+  if (numRead == 0) {
+    range.start = nullptr;
+    range.end = nullptr;
+    return;
+  }
+  range.start = buffer;
+  range.end = buffer + numRead;
+}
+}
diff --git a/caffe2/operators/text_file_reader_utils.h b/caffe2/operators/text_file_reader_utils.h
new file mode 100644
index 0000000..17e888a
--- /dev/null
+++ b/caffe2/operators/text_file_reader_utils.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace caffe2 {
+
+struct Token {
+  int startDelimId;
+  const char* start;
+  const char* end;
+};
+
+class TokenizedString {
+  // holder for strings that have been modified
+  std::vector<std::unique_ptr<std::string>> modifiedStrings_;
+  std::vector<Token> tokens_;
+  int lastDelim_;
+
+ public:
+  const std::vector<Token>& tokens() const {
+    return tokens_;
+  }
+  const int lastDelim() const {
+    return lastDelim_;
+  }
+  friend class Tokenizer;
+};
+
+class Tokenizer {
+ private:
+  int startDelimId_;
+  // state of the tokenizer
+  std::string leftover_;
+  // how many characters of the next batch to skip, e.g. because an escape
+  // char was the last character of the last batch.
+  int toBeSkipped_;
+  int delimTable_[256];
+  const char escape_;
+
+ public:
+  Tokenizer(const std::vector<char>& delimiters, char escape);
+  void reset();
+  void next(char* start, char* end, TokenizedString& tokenized);
+};
+
+struct CharRange {
+  char* start;
+  char* end;
+};
+
+struct StringProvider {
+  virtual void operator()(CharRange&) = 0;
+  virtual void reset() = 0;
+  virtual ~StringProvider() {}
+};
+
+class BufferedTokenizer {
+ public:
+  BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
+      : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
+
+  bool next(Token& token) {
+    CharRange range;
+    while (tokenIndex_ >= tokenized_.tokens().size()) {
+      range.start = nullptr;
+      while (range.start == nullptr && pass_ < numPasses_) {
+        (*provider_)(range);
+        if (range.start == nullptr) {
+          ++pass_;
+          if (pass_ < numPasses_) {
+            provider_->reset();
+            tokenizer_.reset();
+          }
+        }
+      }
+      if (range.start == nullptr) {
+        return false;
+      }
+      tokenizer_.next(range.start, range.end, tokenized_);
+      tokenIndex_ = 0;
+    }
+    token = tokenized_.tokens()[tokenIndex_++];
+    return true;
+  };
+
+  int endDelim() const {
+    if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
+      return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
+    }
+    return tokenized_.lastDelim();
+  }
+
+ private:
+  StringProvider* provider_;
+  Tokenizer tokenizer_;
+  TokenizedString tokenized_;
+  int tokenIndex_;
+  int numPasses_;
+  int pass_{0};
+};
+
+class FileReader : public StringProvider {
+ public:
+  explicit FileReader(const std::string& path, size_t bufferSize = 65536);
+  ~FileReader();
+  void operator()(CharRange& range) override;
+  void reset() override;
+
+ private:
+  const size_t bufferSize_;
+  int fd_;
+  std::unique_ptr<char[]> buffer_;
+};
+}
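A minimal usage sketch of the Tokenizer added above (illustrative only; the delimiters, escape character and input are chosen arbitrarily, and it assumes the new header is on the include path):

    #include <cstdio>
    #include <string>
    #include "caffe2/operators/text_file_reader_utils.h"

    int main() {
      caffe2::Tokenizer tokenizer({',', '\n'}, '\\');
      std::string buf = "a,b\\,c\nd,e\n";  // '\\' escapes the comma inside "b,c"

      caffe2::TokenizedString tokenized;
      tokenizer.next(&buf[0], &buf[0] + buf.size(), tokenized);
      for (const auto& t : tokenized.tokens()) {
        // startDelimId is the index (in the delimiter list) of the delimiter
        // that preceded the token; 0 is also used at the start of the stream.
        std::printf("%d: %s\n",
                    t.startDelimId, std::string(t.start, t.end).c_str());
      }
      // prints "0: a", "0: b,c", "1: d", "0: e"
      return 0;
    }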
diff --git a/caffe2/utils/string_utils_test.cc b/caffe2/operators/text_file_reader_utils_test.cc
similarity index 97%
rename from caffe2/utils/string_utils_test.cc
rename to caffe2/operators/text_file_reader_utils_test.cc
index ab3e247..c2a8799 100644
--- a/caffe2/utils/string_utils_test.cc
+++ b/caffe2/operators/text_file_reader_utils_test.cc
@@ -6,6 +6,7 @@
 #include "caffe2/utils/math.h"
 #include "gtest/gtest.h"
 
+#include "caffe2/operators/text_file_reader_utils.h"
 #include "caffe2/utils/string_utils.h"
 
 #include <cstdio>
@@ -13,7 +14,7 @@
 
 namespace caffe2 {
 
-TEST(StringTest, TokenizeTest) {
+TEST(TextFileReaderUtilsTest, TokenizeTest) {
   TokenizedString tokenized;
   std::string ch =
       "label\1text\xc3\xbf\nlabel2\\\nTest\1tex\\\\t2\n"
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 4f744d7..4e35918 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -6,6 +6,7 @@
 REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Flatten, FlattenOp<CPUContext>);
+REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Sum, SumOp<float, CPUContext>);
@@ -16,8 +17,13 @@
     ScatterWeightedSumOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
 // From whatever the current context, ensure the output is TensorCPU
-REGISTER_CPU_OPERATOR(EnsureCPUOutput,
-                      CopyOp<CPUContext, CPUContext, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    EnsureCPUOutput,
+    CopyOp<CPUContext, CPUContext, CPUContext>);
+// From CPU, copy it to whatever the current context
+REGISTER_CPU_OPERATOR(
+    CopyFromCPUInput,
+    CopyOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(Shape, ShapeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Reshape, ReshapeOp<float, CPUContext>);
@@ -25,6 +31,7 @@
 REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
+REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
@@ -32,6 +39,9 @@
 REGISTER_CPU_OPERATOR(Slice, SliceOp<int, CPUContext>);
 REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp<CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SegmentIdsToLengthWeights,
+    SegmentIdsToLengthWeightsOp<CPUContext>);
 
 OPERATOR_SCHEMA(WallClockTime)
     .NumInputs(0)
@@ -104,6 +114,18 @@
         "with first dimension equal first dimension of input, and remaining "
         "input dimensions flatenned into the inner dimension of the output.");
 
+OPERATOR_SCHEMA(FlattenToVec)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Flattens the input tensor into a 1D vector.
+)DOC")
+    .Input(0, "input", "A tensor of rank >= 1.")
+    .Output(
+        0,
+        "output",
+        "A tensor of rank 1 with the contents of the input tensor");
+
 OPERATOR_SCHEMA(Alias)
     .NumInputs(1)
     .NumOutputs(1)
@@ -257,6 +279,16 @@
     .Input(0, "input", "The input CUDA or CPU tensor.")
     .Output(0, "output", "TensorCPU that is a copy of the input.");
 
+OPERATOR_SCHEMA(CopyFromCPUInput)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Take a CPU input tensor and copy it to an output in the current
+Context (GPU or CPU). This may involve a cross-device MemCpy.
+)DOC")
+    .Input(0, "input", "The input CPU tensor.")
+    .Output(0, "output", "either a TensorCUDA or a TensorCPU");
+
 OPERATOR_SCHEMA(Shape)
     .NumInputs(1)
     .NumOutputs(1)
@@ -312,6 +344,49 @@
     .Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
     .Output(0, "OUTPUT", "Tensor of rank q + (r - 1).");
 
+OPERATOR_SCHEMA(GatherRanges)
+    .NumInputs(2)
+    .NumOutputs(2)
+    .SetDoc(R"DOC(
+Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather
+corresponding ranges into a 1-D tensor OUTPUT.
+
+RANGES dimensions description:
+1: represents list of examples within a batch
+2: represents list of features
+3: two values which are the start and length of a range (to be applied on DATA)
+
+Another output LENGTHS represents each example length within OUTPUT
+
+Example:
+  DATA  = [1, 2, 3, 4, 5, 6]
+  RANGES = [
+    [
+      [0, 1],
+      [2, 2],
+    ],
+    [
+      [4, 1],
+      [5, 1],
+    ]
+  ]
+  OUTPUT = [1, 3, 4, 5, 6]
+  LENGTHS = [3, 2]
+)DOC")
+    .Input(0, "DATA", "Tensor of rank 1.")
+    .Input(
+        1,
+        "RANGES",
+        "Tensor of int32/int64 ranges, of dims (N, M, 2). "
+        "Where N is the number of examples and M is the number of ranges per "
+        "example. Last dimension represents a range in the format (start, length)")
+    .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
+    .Output(
+        1,
+        "LENGTHS",
+        "1-D tensor of size N with lengths over gathered data"
+        " for each row in a batch. sum(LENGTHS) == OUTPUT.size()");
+
 OPERATOR_SCHEMA(Unique)
     .NumInputs(1)
     .NumOutputs(1, 2)
@@ -367,6 +442,20 @@
     .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
     .Output(0, "lengths", "1-D int64_t tensor of segment lengths");
 
+OPERATOR_SCHEMA(SegmentIdsToLengthWeights)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .Arg("power", "n of 1/pow(length,n) for normalization")
+    .SetDoc(
+        R"DOC( Similar as SegmentIdsToLengths but output vector of segment
+weights derived by lengths. i.e 1/pow(length, power)
+)DOC")
+    .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
+    .Output(
+        0,
+        "a vector of weights",
+        "1-D float tensor of segment weights by length");
+
 OPERATOR_SCHEMA(Slice)
     .NumInputs(3)
     .NumOutputs(1)
@@ -403,8 +492,7 @@
     .AllowInplace({{0, 0}})
     .SetDoc(R"DOC(
 Remove single-dimensional entries from the shape of a tensor.
-Takes an optional parameter `dims` with a list of dimension to squeeze.
-If `dims` is not provided, all singleton dimensions are squeezed.
+Takes a parameter `dims` with a list of dimensions to squeeze.
 If the same blob is provided in input and output, the operation is copy-free.
 This is the exact inverse operation of ExpandDims given the same `dims` arg.
 )DOC")
@@ -522,8 +610,10 @@
 SHOULD_NOT_DO_GRADIENT(Unique);
 SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
+SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
 // TODO(azzolini): Add support for slice gradient
 SHOULD_NOT_DO_GRADIENT(Slice);
+SHOULD_NOT_DO_GRADIENT(GatherRanges);
 
 } // namespace
 
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index b7ca5ba..3b4486f 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -54,9 +54,12 @@
       log_file_.reset(new std::ofstream(
           target_folder + "/" + def().input(0) + kPrintFileExtension,
           std::ofstream::out | std::ofstream::trunc));
-      CHECK(log_file_->good()) << "Failed to open PrintOp file for tensor "
-                               << def().input(0)
-                               << ". rdstate() = " << log_file_->rdstate();
+      CAFFE_ENFORCE(
+          log_file_->good(),
+          "Failed to open PrintOp file for tensor ",
+          def().input(0),
+          ". rdstate() = ",
+          log_file_->rdstate());
     }
   }
 
@@ -201,6 +204,27 @@
   }
 };
 
+template <class Context>
+class FlattenToVecOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(FlattenToVecOp);
+
+  bool RunOnDevice() override {
+    auto& input = Input(0);
+    auto* output = Output(0);
+    DCHECK_GT(input.size(), 0);
+    output->Resize(input.size());
+
+    context_.template CopyItems<Context, Context>(
+        input.meta(),
+        input.size(),
+        input.raw_data(),
+        output->raw_mutable_data(input.meta()));
+    return true;
+  }
+};
+
 // Output gets the data of input(0), but reshapes it like input(1).
 template <class Context>
 class ResizeLikeOp : public Operator<Context> {
@@ -240,10 +264,16 @@
     T* output_data = output->template mutable_data<T>();
     // Dimension checking
     for (int i = 1; i < InputSize(); ++i) {
-      CHECK(output->dims() == Input(i).dims())
-          << ProtoDebugString(def()) << "\n"
-          << output->dims() << "\n"
-          << "Input " << i << ": " << Input(i).dims();
+      if (output->dims() != Input(i).dims()) {
+        CAFFE_THROW(
+            "Check failed: output->dims() == Input(i).dims().",
+            "Description: Input #",
+            i,
+            ", input dimension:",
+            Input(i).dims(),
+            " should match output dimension: ",
+            output->dims());
+      }
     }
 
     // Add the first two - works if in-place or not.
@@ -532,14 +562,9 @@
   USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp);
 
   bool RunOnDevice() override {
-    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
-  }
-
-  template <typename Index>
-  bool DoRunWithType() {
     auto& input = Input(0);
     auto* output = Output(0);
-    auto* input_data = input.template data<Index>();
+    auto* input_data = input.template data<int32_t>();
 
     CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
     auto total_length =
@@ -564,22 +589,17 @@
   USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp);
 
   bool RunOnDevice() override {
-    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
-  }
-
-  template <typename Index>
-  bool DoRunWithType() {
     auto& input = Input(0);
     auto* output = Output(0);
-    auto* input_data = input.template data<Index>();
+    auto* input_data = input.template data<int32_t>();
 
     CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
     auto size = input.size();
 
     output->Resize(size, 2);
-    auto* output_data = output->template mutable_data<Index>();
+    auto* output_data = output->template mutable_data<int32_t>();
 
-    Index offset = 0;
+    int32_t offset = 0;
     for (int i = 0; i < size; ++i) {
       auto len = input_data[i];
       output_data[i * 2] = offset;
@@ -611,7 +631,7 @@
     auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
     CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
     output->Resize(num_segments);
-    auto* output_data = output->template mutable_data<int64_t>();
+    auto* output_data = output->template mutable_data<int32_t>();
     if (num_segments == 0) {
       return true;
     }
@@ -632,6 +652,83 @@
   }
 };
 
+template <class Context>
+class SegmentIdsToLengthWeightsOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  SegmentIdsToLengthWeightsOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        power_(OperatorBase::GetSingleArgument<float>("power", 0.5)) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
+  }
+
+  template <typename Index>
+  bool DoRunWithType() {
+    auto& input = Input(0);
+    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
+    auto* input_data = input.template data<Index>();
+    auto input_size = input.size();
+    auto* output = Output(0);
+
+    // segment id starts from 0
+    auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
+    CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
+
+    std::vector<int64_t> seg_lengths(num_segments, 0);
+
+    output->Resize(input_size);
+    auto* output_data = output->template mutable_data<float>();
+    if (num_segments == 0) {
+      return true;
+    }
+    std::fill(output_data, output_data + num_segments, 0);
+
+    Index prev = input_data[0];
+    for (int64_t i = 0; i < input_size; i++) {
+      CAFFE_ENFORCE(
+          prev == input_data[i] || prev + 1 == input_data[i],
+          "Segment ids must be sorted and at least size 1: ",
+          prev,
+          " vs ",
+          input_data[i]);
+      prev = input_data[i];
+      seg_lengths[input_data[i]] += 1;
+    }
+
+    int64_t in = 0;
+
+    std::function<float(const int64_t& length, const float& power)> getWeight;
+
+    if (power_ == 0.5) {
+      getWeight = [](const int64_t& length, const float& power) {
+        return 1.0 / sqrt(length);
+      };
+    } else if (power_ == 1) {
+      getWeight = [](const int64_t& length, const float& power) {
+        return 1.0 / length;
+      };
+    } else {
+      getWeight = [](const int64_t& length, const float& power) {
+        return 1.0 / pow(length, power);
+      };
+    }
+
+    for (int64_t i = 0; i < num_segments; i++) {
+      float weight = getWeight(seg_lengths[i], power_);
+      for (int64_t j = 0; j < seg_lengths[i]; j++) {
+        output_data[in++] = weight;
+      }
+    }
+
+    return true;
+  }
+
+ private:
+  float power_;
+};
+
 template <class SIndex, class Context>
 class SliceOp : public Operator<Context> {
  public:
@@ -848,12 +945,23 @@
     if (unknown_idx != -1) {
       CAFFE_ENFORCE(
           total_size % size == 0,
-          "Argument `shape` does not agree with the input data.");
+          "Argument `shape` does not agree with the input data.",
+          " (",
+          total_size,
+          " vs ",
+          size,
+          ")");
       new_shape_[unknown_idx] = total_size / size;
     } else {
-      CAFFE_ENFORCE(
-          total_size == size,
-          "Argument `shape` does not agree with the input data.");
+      CAFFE_ENFORCE_EQ(
+          total_size,
+          size,
+          "Argument `shape` does not agree with the input data.",
+          " (",
+          total_size,
+          " != ",
+          size,
+          ")");
     }
 
     // Write the original shape to the second output.
@@ -887,16 +995,11 @@
   USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp);
 
   bool RunOnDevice() override {
-    return DispatchHelper<TensorTypes<int, long>>::call(this, Input(0));
-  }
-
-  template <typename T>
-  bool DoRunWithType() {
     auto& input = Input(0);
 
     CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
     auto* output = Output(0);
-    auto* input_data = input.template data<T>();
+    auto* input_data = input.template data<int32_t>();
 
     auto size = input.size();
     auto first = input_data[0];
@@ -907,7 +1010,7 @@
     }
 
     output->Resize(2);
-    auto* output_data = output->template mutable_data<T>();
+    auto* output_data = output->template mutable_data<int32_t>();
     output_data[0] = size;
     output_data[1] = first;
 
@@ -923,13 +1026,14 @@
       : Operator<Context>(operator_def, ws),
         dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
     auto originalSize = dims_.size();
+    CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
+
     std::sort(dims_.begin(), dims_.end());
     std::unique(dims_.begin(), dims_.end());
     if (dims_.size() < originalSize) {
       LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
     }
-    CHECK(dims_.empty() || dims_.front() >= 0)
-        << "Dimension ids must be non-negative.";
+    CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
   }
 
   bool RunOnDevice() override {
@@ -947,13 +1051,11 @@
     for (int i = 0; i < input.dims().size(); ++i) {
       if (j < dims_.size() && dims_[j] == i) {
         CAFFE_ENFORCE(
-            input.dims()[i] == 1, "Dimension ", i, " of input must be 1.");
+            input.dims()[i] == 1, "Dimension ", i, " of input must be 1",
+            " instead of ", input.dims()[i], ".");
         ++j;
         continue;
-      } else if (dims_.empty() && input.dim(i) == 1) {
-        continue;
       }
-
       newDims.push_back(input.dims().at(i));
     }
     output->Reshape(newDims);
@@ -975,13 +1077,13 @@
       : Operator<Context>(operator_def, ws),
         dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
     auto originalSize = dims_.size();
+    CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
     std::sort(dims_.begin(), dims_.end());
     std::unique(dims_.begin(), dims_.end());
     if (dims_.size() < originalSize) {
       LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
     }
-    CHECK(dims_.empty() || dims_.front() >= 0)
-        << "Dimension ids must be non-negative.";
+    CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
   }
 
   bool RunOnDevice() override {
@@ -1020,7 +1122,7 @@
 
   template <typename Index>
   bool DoRunWithType() {
-    // If we endup using it on GPU doint O(N) memcpy is probably not best :)
+    // If we end up using it on GPU, doing O(N) memcpy is probably not best :)
     // TODO: implement prefetching if it starts mattering (TF does it)
     auto& data = Input(DATA);
     auto& indices = Input(INDICES);
@@ -1053,6 +1155,86 @@
   INPUT_TAGS(DATA, INDICES);
 };
 
+template <class Context>
+class GatherRangesOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  USE_SIMPLE_CTOR_DTOR(GatherRangesOp);
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
+        this, OperatorBase::Input<TensorCPU>(RANGES));
+  }
+
+  template <typename Index>
+  bool DoRunWithType() {
+    auto& data = Input(DATA);
+    auto& ranges = Input(RANGES);
+    auto* outputData = Output(0);
+    auto* outputLengths = Output(1);
+
+    auto batchSize = ranges.dim(0);
+    CAFFE_ENFORCE(data.ndim() == 1, "Data has to be 1-D");
+    CAFFE_ENFORCE(ranges.ndim() == 3, "Ranges must be 3-D");
+    CAFFE_ENFORCE(batchSize > 0, "Batch of examples can't be empty");
+    CAFFE_ENFORCE(ranges.dim(1) > 0, "There has to be at least one range");
+    CAFFE_ENFORCE(
+        ranges.dim(2) == 2, "Ranges last dimension should be of size 2");
+
+    auto* rawData = static_cast<const char*>(data.raw_data());
+    auto* rangesData = ranges.template data<Index>();
+
+    outputLengths->Resize(batchSize);
+    auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>();
+    size_t start = 0;
+    size_t blockSize = ranges.size() / batchSize;
+    for (size_t i = 0; i < batchSize; ++i) {
+      auto end = start + blockSize;
+      outputLengthsPtr[i] = accumulate(rangesData, start, end);
+      start = end;
+    }
+
+    size_t outputSize = accumulate(rangesData, 0, ranges.size());
+    outputData->Resize(outputSize);
+
+    auto outputRawData =
+        static_cast<char*>(outputData->raw_mutable_data(data.meta()));
+    VLOG(1) << "Copying data";
+    size_t outputOffsetBytes = 0;
+    auto itemsize = data.meta().itemsize();
+    for (int i = 0; i < ranges.size(); i += 2) {
+      auto rangeStart = rangesData[i];
+      auto rangeLength = rangesData[i + 1];
+      if (!rangeLength) {
+        continue;
+      }
+      auto rangeSizeBytes = rangeLength * itemsize;
+      CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
+      CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
+      VLOG(2) << "Performing copy for range i";
+      context_.template CopyItems<Context, Context>(
+          data.meta(),
+          rangeLength,
+          rawData + rangeStart * itemsize,
+          outputRawData + outputOffsetBytes);
+      outputOffsetBytes += rangeSizeBytes;
+    }
+    CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize);
+    return true;
+  }
+
+  INPUT_TAGS(DATA, RANGES, LENGTHS);
+
+ private:
+  template <typename Index>
+  size_t accumulate(Index* ranges, size_t start, size_t end) {
+    size_t result = 0;
+    for (int i = start + 1; i < end; i += 2) {
+      result += ranges[i];
+    }
+    return result;
+  }
+};
+
 // Since we just do copying, consider untemplating it on T and using raw_data()
 /**
  * Deduplicates input indices vector and optionally produces reverse remapping.
diff --git a/caffe2/operators/utility_ops_gpu.cc b/caffe2/operators/utility_ops_gpu.cc
index eed7ad3..270a1a3 100644
--- a/caffe2/operators/utility_ops_gpu.cc
+++ b/caffe2/operators/utility_ops_gpu.cc
@@ -6,23 +6,31 @@
 
 REGISTER_CUDA_OPERATOR(Print, PrintOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(Flatten, FlattenOp<CUDAContext>);
+REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(Alias, AliasOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(Sum, SumOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp<float, CUDAContext>);
 // From whatever the current context, ensure the output is TensorCPU
-REGISTER_CUDA_OPERATOR(EnsureCPUOutput,
-                       CopyOp<CUDAContext, CPUContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+    EnsureCPUOutput,
+    CopyOp<CUDAContext, CPUContext, CUDAContext>);
+// From CPU, copy it to whatever the current context
+REGISTER_CUDA_OPERATOR(
+    CopyFromCPUInput,
+    CopyOp<CUDAContext, CUDAContext, CPUContext>);
+
 // CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context,
 // since gpu code will be involved.
-REGISTER_CUDA_OPERATOR(CopyGPUToCPU,
-                       CopyOp<CUDAContext, CPUContext, CUDAContext>);
-REGISTER_CUDA_OPERATOR(CopyCPUToGPU,
-                       CopyOp<CUDAContext, CUDAContext, CPUContext>);
+REGISTER_CUDA_OPERATOR(
+    CopyGPUToCPU,
+    CopyOp<CUDAContext, CPUContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+    CopyCPUToGPU,
+    CopyOp<CUDAContext, CUDAContext, CPUContext>);
 // If we only specify Copy, we assume that it is a gpu to gpu copy - maybe
 // involving different GPUs.
-REGISTER_CUDA_OPERATOR(Copy,
-                       CopyOp<CUDAContext, CUDAContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(Copy, CopyOp<CUDAContext, CUDAContext, CUDAContext>);
 
 }  // namespace
 }  // namespace caffe2
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index 7a04ba2..6cf3fc8 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -1,10 +1,11 @@
 from caffe2.python import core
+from caffe2.python.model_helper import ModelHelperBase
 from caffe2.proto import caffe2_pb2
 
 import logging
 
 
-class CNNModelHelper(object):
+class CNNModelHelper(ModelHelperBase):
     """A helper model so we can write CNN models more easily, without having to
     manually define parameter initializations and operators separately.
     """
@@ -12,40 +13,20 @@
     def __init__(self, order="NCHW", name=None,
                  use_cudnn=True, cudnn_exhaustive_search=False,
                  ws_nbytes_limit=None, init_params=True):
-        if name is None:
-            name = "CNN"
-        self.net = core.Net(name)
-        self.param_init_net = core.Net(name + '_init')
-        self.params = []
-        self.param_to_grad = {}
+        super(CNNModelHelper, self).__init__(
+            name="CNN" if name is None else name, init_params=init_params)
+
         self.weights = []
         self.biases = []
         self.order = order
         self.use_cudnn = use_cudnn
         self.cudnn_exhaustive_search = cudnn_exhaustive_search
         self.ws_nbytes_limit = ws_nbytes_limit
-        self.init_params = init_params
-        self.gradient_ops_added = False
         if self.order != "NHWC" and self.order != "NCHW":
             raise ValueError(
                 "Cannot understand the CNN storage order %s." % self.order
             )
 
-    def Proto(self):
-        return self.net.Proto()
-
-    def InitProto(self):
-        return self.param_init_net.Proto()
-
-    def RunAllOnGPU(self, *args, **kwargs):
-        self.param_init_net.RunAllOnGPU(*args, **kwargs)
-        self.net.RunAllOnGPU(*args, **kwargs)
-
-    def CreateDB(self, blob_out, db, db_type, **kwargs):
-        dbreader = self.param_init_net.CreateDB(
-            [], blob_out, db=db, db_type=db_type, **kwargs)
-        return dbreader
-
     def ImageInput(
             self, blob_in, blob_out, **kwargs
     ):
@@ -59,17 +40,6 @@
                 blob_in, blob_out, **kwargs)
         return data, label
 
-    def TensorProtosDBInput(
-        self, unused_blob_in, blob_out, batch_size, db, db_type, **kwargs
-    ):
-        """TensorProtosDBInput."""
-        dbreader_name = "dbreader_" + db
-        dbreader = self.param_init_net.CreateDB(
-            [], dbreader_name,
-            db=db, db_type=db_type)
-        return self.net.TensorProtosDBInput(
-            dbreader, blob_out, batch_size=batch_size)
-
     def Conv(
         self, blob_in, blob_out, dim_in, dim_out, kernel, weight_init=None,
         bias_init=None, **kwargs
@@ -237,8 +207,8 @@
         )
         return concat
 
-    def FC(
-        self, blob_in, blob_out, dim_in, dim_out, weight_init=None,
+    def _FC_or_packed_FC(
+        self, op_call, blob_in, blob_out, dim_in, dim_out, weight_init=None,
         bias_init=None, **kwargs
     ):
         """FC"""
@@ -264,7 +234,15 @@
             bias = core.ScopedBlobReference(
                 blob_out + '_b', self.param_init_net)
         self.params.extend([weight, bias])
-        return self.net.FC([blob_in, weight, bias], blob_out, **kwargs)
+        self.weights.append(weight)
+        self.biases.append(bias)
+        return op_call([blob_in, weight, bias], blob_out, **kwargs)
+
+    def FC(self, *args, **kwargs):
+        return self._FC_or_packed_FC(self.net.FC, *args, **kwargs)
+
+    def PackedFC(self, *args, **kwargs):
+        return self._FC_or_packed_FC(self.net.PackedFC, *args, **kwargs)
 
     def FC_Decomp(
         self, blob_in, blob_out, dim_in, dim_out,
@@ -431,7 +409,7 @@
         """Depth Concat."""
         return self.net.Concat(
             blobs_in,
-            [blob_out, "_" + blob_out + "_condat_dims"],
+            [blob_out, "_" + blob_out + "_concat_dims"],
             order=self.order,
             **kwargs
         )[0]
@@ -451,6 +429,10 @@
         """Transpose."""
         return self.net.Transpose(blob_in, blob_out, **kwargs)
 
+    def Sum(self, blob_in, blob_out, **kwargs):
+        """Sum"""
+        return self.net.Sum(blob_in, blob_out, **kwargs)
+
     def SpatialBN(self, blob_in, blob_out, dim_in, **kwargs):
         blob_out = blob_out or self.net.NextName()
         # Input: input, scale, bias, est_mean, est_inv_var
@@ -465,13 +447,15 @@
             return self.param_init_net.ConstantFill(
                 [], blob_out + "_" + suffix, shape=[dim_in], value=value)
         scale, bias = init_blob(1.0, "s"), init_blob(0.0, "b")
+        running_mean = init_blob(0.0, "rm")
+        running_inv_var = init_blob(1.0, "riv")
         self.params.extend([scale, bias])
         self.weights.append(scale)
         self.biases.append(bias)
-        blob_outs = [blob_out, blob_out + "_rm", blob_out + "_riv",
+        blob_outs = [blob_out, running_mean, running_inv_var,
                      blob_out + "_sm", blob_out + "_siv"]
         blob_outputs = self.net.SpatialBN(
-            [blob_in, scale, bias, blob_outs[1], blob_outs[2]], blob_outs,
+            [blob_in, scale, bias, running_mean, running_inv_var], blob_outs,
             order=self.order, **kwargs)
         # Return the output
         return blob_outputs[0]
@@ -500,15 +484,22 @@
     def ZeroInit(self):
         return ('ConstantFill', {})
 
-    def AddGradientOperators(self, *args, **kwargs):
-        if self.gradient_ops_added:
-            raise RuntimeError("You cannot run AddGradientOperators twice.")
-        self.gradient_ops_added = True
-        grad_map = self.net.AddGradientOperators(*args, **kwargs)
-        for p in self.params:
-            if str(p) in grad_map:
-                self.param_to_grad[p] = grad_map[str(p)]
-        return grad_map
+    def AddWeightDecay(self, weight_decay):
+        """Adds a decay to weights in the model.
+
+        This is a form of L2 regularization.
+
+        Args:
+            weight_decay: strength of the regularization
+        """
+        if weight_decay <= 0.0:
+            return
+        wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
+                                              value=weight_decay)
+        ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+        for param in self.weights:
+            grad = self.param_to_grad[param]
+            # Equivalent to: grad += wd * param
+            self.net.WeightedSum([grad, ONE, param, wd], grad)
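A minimal usage sketch (not part of the patch; blob names and dimensions are illustrative, and it assumes operator calls are forwarded to the underlying net and FC falls back to its default initializers). AddWeightDecay must run after AddGradientOperators so that param_to_grad is populated:

    model = CNNModelHelper(order="NCHW", name="wd_example")
    pred = model.FC("data", "pred", dim_in=16, dim_out=1)
    dist = model.SquaredL2Distance([pred, "label"], "dist")
    loss = model.AveragedLoss(dist, "loss")
    model.AddGradientOperators([loss])  # fills model.param_to_grad
    model.AddWeightDecay(1e-4)          # grad += wd * param for every weight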
 
     @property
     def CPU(self):
@@ -635,32 +626,3 @@
         self.weights += step_net.weights
         self.biases += step_net.biases
         return output, hidden_state, cell_state
-
-    def __getattr__(self, op_type):
-        """Catch-all for all other operators, mostly those without params."""
-        if not core.IsOperator(op_type):
-            raise RuntimeError(
-                'Method ' + op_type + ' is not a registered operator.'
-            )
-        # known_working_ops are operators that do not need special care.
-        known_working_ops = [
-            "Accuracy",
-            "Adam",
-            "AveragedLoss",
-            "Cast",
-            "LabelCrossEntropy",
-            "LearningRate",
-            "Print",
-            "Sigmoid",
-            "Scale",
-            "Snapshot",
-            "Softmax",
-            "StopGradient",
-            "Summarize",
-            "Tanh",
-            "WeightedSum",
-        ]
-        if op_type not in known_working_ops:
-            logging.warning("You are creating an op that the CNNModelHelper "
-                            "does not recognize: {}.".format(op_type))
-        return self.net.__getattr__(op_type)
diff --git a/caffe2/python/control.py b/caffe2/python/control.py
new file mode 100644
index 0000000..9514c3a
--- /dev/null
+++ b/caffe2/python/control.py
@@ -0,0 +1,400 @@
+"""
+Implement functions for controlling execution of nets and steps, including
+  Do
+  DoParallel
+  For-loop
+  While-loop
+  Do-While-loop
+  Switch
+  If
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+
+
+def GetConditionBlobFromNet(condition_net):
+    """
+    The condition blob is the last external_output that must
+    be a single bool
+    """
+    assert len(condition_net.Proto().external_output) > 0, (
+        "Condition net %s must has at least one external output" %
+        condition_net.Proto.name)
+    # we need to use a blob reference here instead of a string
+    # otherwise, it will add another name_scope to the input later
+    # when we create new ops (such as OR of two inputs)
+    return core.BlobReference(condition_net.Proto().external_output[-1])
+
+def NotNet(condition_blob_or_net):
+    """Not of a condition blob or net
+
+    Args:
+    condition_blob_or_net can be either blob or net. If condition_blob_or_net
+    is Net, the condition is its last external_output
+    that must be a single bool.
+
+    Returns:
+    not_net: a net that computes the NOT of the input condition
+    out_blob: the output blob of not_net
+    """
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+    else:
+        condition_blob = condition_blob_or_net
+
+    not_net = core.Net('not_net')
+    out_blob = not_net.Not(condition_blob)
+    not_net.AddExternalOutput(out_blob)
+
+    return not_net, out_blob
+
+
+def _CopyConditionBlobNet(condition_blob):
+    """Make a condition net that copies the condition_blob
+
+    Args:
+    condition_blob is a single bool.
+
+    Returns:
+    condition_net: a net that copies condition_blob
+    out_blob: the output blob of condition_net
+    """
+    condition_net = core.Net('copy_condition_blob_net')
+    out_blob = condition_net.Copy(condition_blob)
+    condition_net.AddExternalOutput(out_blob)
+
+    return condition_net, out_blob
+
+
+def MergeConditionNets(name, condition_nets, relation):
+    """
+    Merge multiple condition nets into a single condition net.
+
+    Args:
+        name: name of the new condition net.
+        condition_nets: a list of condition nets. The last external_output
+                        of each condition net must be a single bool value.
+        relation: can be 'And' or 'Or'.
+
+    Returns:
+        - A new condition net. Its last external output is relation of all
+          condition_nets.
+    """
+    if not isinstance(condition_nets, list):
+        return condition_nets
+    if len(condition_nets) <= 1:
+        return condition_nets[0] if condition_nets else None
+
+    merged_net = core.Net(name)
+    for i in range(len(condition_nets)):
+        net_proto = condition_nets[i].Proto()
+        assert net_proto.device_option == merged_net.Proto().device_option
+        assert net_proto.type == merged_net.Proto().type
+        merged_net.Proto().op.extend(net_proto.op)
+        merged_net.Proto().external_input.extend(net_proto.external_input)
+        # discard external outputs as we're combining them together
+        curr_cond = GetConditionBlobFromNet(condition_nets[i])
+        if i == 0:
+            last_cond = curr_cond
+        else:
+            last_cond = merged_net.__getattr__(relation)([last_cond, curr_cond])
+
+    merged_net.AddExternalOutput(last_cond)
+
+    return merged_net
+
+
+def Do(*nets_or_steps):
+    """
+    Execute the sequence of nets or steps once.
+
+    Examples:
+    - Do(net1, net2, ..., net_n)
+    - Do(list_of_nets)
+    - Do(step1, step2, ..., step_n)
+    - Do(list_of_steps)
+    """
+    if len(nets_or_steps) == 0:
+        raise ValueError(
+            'nets_or_steps cannot be empty.')
+    elif len(nets_or_steps) == 1:
+        nets_or_steps = nets_or_steps[0]
+    else:
+        nets_or_steps = list(nets_or_steps)
+
+    return core.execution_step('Do', nets_or_steps)
+
+
+def DoParallel(*nets_or_steps):
+    """
+    Execute the nets or steps in parallel, waiting for all of them to finish
+
+    Examples:
+    - DoParallel(net1, net2, ..., net_n)
+    - DoParallel(list_of_nets)
+    - DoParallel(step1, step2, ..., step_n)
+    - DoParallel(list_of_steps)
+    """
+    if len(nets_or_steps) == 0:
+        raise ValueError(
+            'nets_or_steps cannot be empty.')
+    elif len(nets_or_steps) == 1:
+        nets_or_steps = nets_or_steps[0]
+    else:
+        nets_or_steps = list(nets_or_steps)
+
+    return core.execution_step(
+        'DoParallel', nets_or_steps, concurrent_substeps=True)
+
+
+def _StopNet(stop_blob):
+    stop_net = core.Net('stop_net')
+    stop_net.ConstantFill(
+        [], [stop_blob], shape=[], value=True, dtype=core.DataType.BOOL)
+    return stop_net
+
+
+def _ToExecutionStep(net_or_step):
+    if isinstance(net_or_step, core.Net):
+        return Do(net_or_step)
+    elif isinstance(net_or_step, core.ExecutionStep):
+        return net_or_step
+    else:
+        raise ValueError(
+            'net_or_step must be a net or a step.')
+
+
+def _RunOnceIf(condition_blob_or_net, net_or_step):
+    """
+    Execute net_or_step once if condition_blob_or_net evaluates as true.
+
+    If condition_blob_or_net is a Net, the condition is its last external_output,
+    which must be a single bool, and the net is executed before net_or_step
+    so as to get the condition.
+    """
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+        return Do(Do(condition_blob_or_net),
+                  _RunOnceIf(condition_blob, net_or_step))
+
+    stop_if_not_net, stop_blob = NotNet(condition_blob_or_net)
+    stop_net = _StopNet(stop_blob)
+
+    return core.execution_step(
+        '_RunOnceIf',
+        [Do(stop_if_not_net), _ToExecutionStep(net_or_step), Do(stop_net)],
+        should_stop_blob=stop_blob)
+
+
+def _RunOnceIfNot(condition_blob_or_net, net_or_step):
+    """
+    Similar to _RunOnceIf() but executes net_or_step once if
+    condition_blob_or_net evaluates as false.
+    """
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+        return Do(Do(condition_blob_or_net),
+                  _RunOnceIfNot(condition_blob, net_or_step))
+
+    stop_if_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
+    stop_net = _StopNet(stop_blob)
+
+    return core.execution_step(
+        '_RunOnceIfNot',
+        [Do(stop_if_net), _ToExecutionStep(net_or_step), Do(stop_net)],
+        should_stop_blob=stop_blob)
+
+
+def For(net_or_step, iter_num):
+    """
+    Execute net_or_step iter_num times.
+
+    Args:
+    net_or_step: an instance of an ExecutionStep or a Net.
+    iter_num:    the number of times to execute the net_or_step.
+
+    Returns:
+    An ExecutionStep instance.
+    """
+    init_net = core.Net('init-net')
+    iter_cnt = init_net.CreateCounter([], init_count=iter_num)
+    iter_net = core.Net('For-iter')
+    iter_done = iter_net.CountDown([iter_cnt])
+
+    if isinstance(net_or_step, core.Net):
+        for_step = core.execution_step(
+            'For', [iter_net, net_or_step], should_stop_blob=iter_done)
+    elif isinstance(net_or_step, core.ExecutionStep):
+        for_step = core.execution_step(
+            'For', [Do(iter_net), net_or_step], should_stop_blob=iter_done)
+    else:
+        raise ValueError(
+            'net_or_step must be a net or a step.')
+
+    return Do(Do(init_net), for_step)
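A minimal sketch of For(), mirroring the usage in control_test.py below (net names are illustrative):

    from caffe2.python import control, core, workspace

    init_net = core.Net("init")
    cnt = init_net.CreateCounter([], init_count=0)
    cnt_net = core.Net("count")
    cnt_net.CountUp([cnt])

    plan = core.Plan("for_example")
    plan.AddStep(control.Do(init_net))
    plan.AddStep(control.For(cnt_net, 10))  # executes cnt_net 10 times
    workspace.RunPlan(plan)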
+
+
+def While(condition_blob_or_net, net_or_step):
+    """
+    Execute net_or_step repeatedly while condition_blob_or_net returns true.
+
+    Args:
+    condition_blob_or_net: If it is an instance of Net, its last
+      external_output must be a single bool.
+    net_or_step: an instance of an ExecutionStep or a Net.
+
+    Returns:
+    An ExecutionStep instance.
+    """
+    condition_not_net, stop_blob = NotNet(condition_blob_or_net)
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_step = Do(condition_blob_or_net, condition_not_net)
+    else:
+        condition_step = Do(condition_not_net)
+
+    return core.execution_step(
+        'While',
+        [condition_step, _ToExecutionStep(net_or_step)],
+        should_stop_blob=stop_blob)
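Continuing the sketch after For() above, and following control_test.py, a While() loop needs a condition net whose last external output is a single bool (names are illustrative):

    const_n = init_net.ConstantFill(
        [], shape=[], value=10, dtype=core.DataType.INT64)
    curr = cnt_net.RetrieveCount([cnt])
    cnt_net.AddExternalOutput(curr)
    # pre-create `curr` so the condition can be evaluated before the first pass
    init_net.ConstantFill([], [curr], shape=[], value=0, dtype=core.DataType.INT64)

    cond_net = core.Net("cond")
    cond_net.AddExternalOutput(cond_net.LT([curr, const_n]))

    plan = core.Plan("while_example")
    plan.AddStep(control.Do(init_net))
    plan.AddStep(control.While(cond_net, cnt_net))  # runs while curr < 10
    workspace.RunPlan(plan)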
+
+
+def Until(condition_blob_or_net, net_or_step):
+    """
+    Similar to While() but executes net_or_step while
+    condition_blob_or_net returns false.
+    """
+    if isinstance(condition_blob_or_net, core.Net):
+        stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
+        condition_step = Do(condition_blob_or_net)
+    else:
+        copy_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
+        condition_step = Do(copy_net)
+
+    return core.execution_step(
+        'Until',
+        [condition_step, _ToExecutionStep(net_or_step)],
+        should_stop_blob=stop_blob)
+
+
+def DoWhile(condition_blob_or_net, net_or_step):
+    """
+    Execute net_or_step repeatedly while condition_blob_or_net returns true;
+    net_or_step is executed at least once.
+
+    Args:
+    condition_blob_or_net: if it is an instance of Net, its last external_output
+      must be a single bool.
+    net_or_step: an instance of an ExecutionStep or a Net.
+
+    Returns:
+    An ExecutionStep instance.
+    """
+    condition_not_net, stop_blob = NotNet(condition_blob_or_net)
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_step = Do(condition_blob_or_net, condition_not_net)
+    else:
+        condition_step = Do(condition_not_net)
+
+    return core.execution_step(
+        'DoWhile',
+        [_ToExecutionStep(net_or_step), condition_step],
+        should_stop_blob=stop_blob)
+
+
+def DoUntil(condition_blob_or_net, net_or_step):
+    """
+    Similar to DoWhile() but executes net_or_step while
+    condition_blob_or_net returns false.
+    """
+    steps = [_ToExecutionStep(net_or_step)]
+
+    if isinstance(condition_blob_or_net, core.Net):
+        steps.append(Do(condition_blob_or_net))
+        stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
+    else:
+        stop_blob = condition_blob_or_net
+
+    stop_blob = core.BlobReference(str(stop_blob))
+    return core.execution_step('DoUntil', steps, should_stop_blob=stop_blob)
+
+
+def Switch(*conditions):
+    """
+    Execute the steps for which the condition is true.
+    Each condition is a tuple (condition_blob_or_net, step).
+    Note:
+      1. Multiple steps can be executed if their conditions are true.
+      2. The condition_blob_or_net (if it is a Net) of every step will be
+         executed once.
+
+    Examples:
+    - Switch((cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
+    - Switch([(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
+    - Switch((cond_1, net_1))
+    """
+    if len(conditions) == 0:
+        raise ValueError(
+            'conditions cannot be empty.')
+    elif len(conditions) == 1:
+        conditions = conditions[0]
+        if not isinstance(conditions, list):
+            conditions = [conditions]
+    else:
+        conditions = list(conditions)
+
+    return core.execution_step(
+        'Switch', [_RunOnceIf(cond, step) for cond, step in conditions])
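A sketch of Switch() with hypothetical nets cond_net_a/cond_net_b and bodies net_a/net_b; each condition net must expose a single bool as its last external output:

    step = control.Switch(
        (cond_net_a, net_a),
        (cond_net_b, net_b),
    )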
+
+
+def If(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
+    """
+    condition_blob_or_net is first evaluated or executed. If the condition is
+    true, true_net_or_step is then executed, otherwise, false_net_or_step
+    is executed.
+
+    If condition_blob_or_net is a Net, the condition is its last external_output,
+    which must be a single bool, and the Net is executed before both
+    true/false_net_or_step so as to get the condition.
+    """
+    if not false_net_or_step:
+        return _RunOnceIf(condition_blob_or_net, true_net_or_step)
+
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+        return Do(Do(condition_blob_or_net),
+                  If(condition_blob, true_net_or_step, false_net_or_step))
+
+    condition_blob = condition_blob_or_net
+    not_net, _ = NotNet(condition_blob)
+
+    return Switch(
+        (condition_blob, true_net_or_step),
+        (not_net, false_net_or_step),
+    )
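A sketch of If() with an else branch, in the style of control_test.py (cond_net and cnt_net as in the earlier sketches):

    true_step = control.For(cnt_net, 10)
    false_step = control.For(cnt_net, 20)
    step = control.If(cond_net, true_step, false_step)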
+
+
+def IfNot(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
+    """
+    If condition_blob_or_net returns false, executes true_net_or_step,
+    otherwise executes false_net_or_step
+    """
+    if not false_net_or_step:
+        return _RunOnceIfNot(condition_blob_or_net, true_net_or_step)
+
+    if isinstance(condition_blob_or_net, core.Net):
+        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+        return Do(Do(condition_blob_or_net),
+                  IfNot(condition_blob, true_net_or_step, false_net_or_step))
+
+    condition_blob = condition_blob_or_net
+    not_net, _ = NotNet(condition_blob)
+
+    return Switch(
+        (condition_blob, false_net_or_step),
+        (not_net, true_net_or_step),
+    )
diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py
new file mode 100644
index 0000000..066f7a6
--- /dev/null
+++ b/caffe2/python/control_test.py
@@ -0,0 +1,217 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import control, core, test_util, workspace
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+class TestControl(test_util.TestCase):
+    def setUp(self):
+        super(TestControl, self).setUp()
+        self.N_ = 10
+
+        self.init_net_ = core.Net("init-net")
+        cnt = self.init_net_.CreateCounter([], init_count=0)
+        const_n = self.init_net_.ConstantFill(
+            [], shape=[], value=self.N_, dtype=core.DataType.INT64)
+        const_0 = self.init_net_.ConstantFill(
+            [], shape=[], value=0, dtype=core.DataType.INT64)
+
+        self.cnt_net_ = core.Net("cnt-net")
+        self.cnt_net_.CountUp([cnt])
+        curr_cnt = self.cnt_net_.RetrieveCount([cnt])
+        self.init_net_.ConstantFill(
+            [], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64)
+        self.cnt_net_.AddExternalOutput(curr_cnt)
+
+        self.cond_net_ = core.Net("cond-net")
+        cond_blob = self.cond_net_.LT([curr_cnt, const_n])
+        self.cond_net_.AddExternalOutput(cond_blob)
+
+        self.not_cond_net_ = core.Net("not-cond-net")
+        cond_blob = self.not_cond_net_.GE([curr_cnt, const_n])
+        self.not_cond_net_.AddExternalOutput(cond_blob)
+
+        self.true_cond_net_ = core.Net("true-cond-net")
+        true_blob = self.true_cond_net_.LT([const_0, const_n])
+        self.true_cond_net_.AddExternalOutput(true_blob)
+
+        self.false_cond_net_ = core.Net("false-cond-net")
+        false_blob = self.false_cond_net_.GT([const_0, const_n])
+        self.false_cond_net_.AddExternalOutput(false_blob)
+
+    def CheckNetOutput(self, nets_and_expects):
+        """
+        Check the net output is expected
+        nets_and_expects is a list of tuples (net, expect)
+        """
+        for net, expect in nets_and_expects:
+            output = workspace.FetchBlob(
+                net.Proto().external_output[-1])
+            self.assertEqual(output, expect)
+
+    def BuildAndRunPlan(self, step):
+        plan = core.Plan("test")
+        plan.AddStep(control.Do(self.init_net_))
+        plan.AddStep(step)
+        self.assertEqual(workspace.RunPlan(plan), True)
+
+    def ForLoopTest(self, net_or_step):
+        step = control.For(net_or_step, self.N_)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+    def testForLoopWithNet(self):
+        self.ForLoopTest(self.cnt_net_)
+
+    def testForLoopWithStep(self):
+        step = control.Do(self.cnt_net_)
+        self.ForLoopTest(step)
+
+    def WhileLoopTest(self, net_or_step):
+        step = control.While(self.cond_net_, net_or_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+    def testWhileLoopWithNet(self):
+        self.WhileLoopTest(self.cnt_net_)
+
+    def testWhileLoopWithStep(self):
+        step = control.Do(self.cnt_net_)
+        self.WhileLoopTest(step)
+
+    def UntilLoopTest(self, net_or_step):
+        step = control.Until(self.not_cond_net_, net_or_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+    def testUntilLoopWithNet(self):
+        self.UntilLoopTest(self.cnt_net_)
+
+    def testUntilLoopWithStep(self):
+        step = control.Do(self.cnt_net_)
+        self.UntilLoopTest(step)
+
+    def DoWhileLoopTest(self, net_or_step):
+        step = control.DoWhile(self.cond_net_, net_or_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+    def testDoWhileLoopWithNet(self):
+        self.DoWhileLoopTest(self.cnt_net_)
+
+    def testDoWhileLoopWithStep(self):
+        step = control.Do(self.cnt_net_)
+        self.DoWhileLoopTest(step)
+
+    def DoUntilLoopTest(self, net_or_step):
+        step = control.DoUntil(self.not_cond_net_, net_or_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+    def testDoUntilLoopWithNet(self):
+        self.DoUntilLoopTest(self.cnt_net_)
+
+    def testDoUntilLoopWithStep(self):
+        step = control.Do(self.cnt_net_)
+        self.DoUntilLoopTest(step)
+
+    def IfCondTest(self, cond_net, expect, cond_on_blob):
+        if cond_on_blob:
+            step = control.Do(
+                control.Do(cond_net),
+                control.If(cond_net.Proto().external_output[-1],
+                           self.cnt_net_))
+        else:
+            step = control.If(cond_net, self.cnt_net_)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, expect)])
+
+    def testIfCondTrueOnNet(self):
+        self.IfCondTest(self.true_cond_net_, 1, False)
+
+    def testIfCondTrueOnBlob(self):
+        self.IfCondTest(self.true_cond_net_, 1, True)
+
+    def testIfCondFalseOnNet(self):
+        self.IfCondTest(self.false_cond_net_, 0, False)
+
+    def testIfCondFalseOnBlob(self):
+        self.IfCondTest(self.false_cond_net_, 0, True)
+
+    def IfElseCondTest(self, cond_net, expect, cond_on_blob):
+        true_step = control.For(self.cnt_net_, self.N_)
+        false_step = control.For(self.cnt_net_, 2 * self.N_)
+        if cond_on_blob:
+            step = control.Do(
+                control.Do(cond_net),
+                control.If(cond_net.Proto().external_output[-1],
+                           true_step, false_step))
+        else:
+            step = control.If(cond_net, true_step, false_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, expect)])
+
+    def testIfElseCondTrueOnNet(self):
+        self.IfElseCondTest(self.true_cond_net_, self.N_, False)
+
+    def testIfElseCondTrueOnBlob(self):
+        self.IfElseCondTest(self.true_cond_net_, self.N_, True)
+
+    def testIfElseCondFalseOnNet(self):
+        self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, False)
+
+    def testIfElseCondFalseOnBlob(self):
+        self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, True)
+
+    def IfNotCondTest(self, cond_net, expect, cond_on_blob):
+        if cond_on_blob:
+            step = control.Do(
+                control.Do(cond_net),
+                control.IfNot(cond_net.Proto().external_output[-1],
+                              self.cnt_net_))
+        else:
+            step = control.IfNot(cond_net, self.cnt_net_)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, expect)])
+
+    def testIfNotCondTrueOnNet(self):
+        self.IfNotCondTest(self.true_cond_net_, 0, False)
+
+    def testIfNotCondTrueOnBlob(self):
+        self.IfNotCondTest(self.true_cond_net_, 0, True)
+
+    def testIfNotCondFalseOnNet(self):
+        self.IfNotCondTest(self.false_cond_net_, 1, False)
+
+    def testIfNotCondFalseOnBlob(self):
+        self.IfNotCondTest(self.false_cond_net_, 1, True)
+
+    def IfNotElseCondTest(self, cond_net, expect, cond_on_blob):
+        true_step = control.For(self.cnt_net_, self.N_)
+        false_step = control.For(self.cnt_net_, 2 * self.N_)
+        if cond_on_blob:
+            step = control.Do(
+                control.Do(cond_net),
+                control.IfNot(cond_net.Proto().external_output[-1],
+                              true_step, false_step))
+        else:
+            step = control.IfNot(cond_net, true_step, false_step)
+        self.BuildAndRunPlan(step)
+        self.CheckNetOutput([(self.cnt_net_, expect)])
+
+    def testIfNotElseCondTrueOnNet(self):
+        self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, False)
+
+    def testIfNotElseCondTrueOnBlob(self):
+        self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, True)
+
+    def testIfNotElseCondFalseOnNet(self):
+        self.IfNotElseCondTest(self.false_cond_net_, self.N_, False)
+
+    def testIfNotElseCondFalseOnBlob(self):
+        self.IfNotElseCondTest(self.false_cond_net_, self.N_, True)
diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index b4f3d5b..810a2bf 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -490,6 +490,17 @@
     return model, 224
 
 
+def AddParameterUpdate(model):
+    """ Simple plain SGD update -- not tuned to actually train the models """
+    ITER = model.Iter("iter")
+    LR = model.LearningRate(
+        ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999)
+    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+    for param in model.params:
+        param_grad = model.param_to_grad[param]
+        model.WeightedSum([param, ONE, param_grad, LR], param)
+
+
 def Benchmark(model_gen, arg):
     model, input_size = model_gen(arg.order)
     model.Proto().type = arg.net_type
@@ -524,6 +535,7 @@
     else:
         print('{}: running forward-backward.'.format(arg.model))
         model.AddGradientOperators(["loss"])
+        AddParameterUpdate(model)
         if arg.order == 'NHWC':
             print(
                 '==WARNING==\n'
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 9d42cad..81147c5 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -57,7 +57,7 @@
     return (op_type + "_ENGINE_" + engine in _REGISTERED_OPERATORS)
 
 
-def DeviceOption(device_type, cuda_gpu_id, random_seed=None):
+def DeviceOption(device_type, cuda_gpu_id=0, random_seed=None):
     option = caffe2_pb2.DeviceOption()
     option.device_type = device_type
     option.cuda_gpu_id = cuda_gpu_id
@@ -106,6 +106,9 @@
     def __str__(self):
         return self._name
 
+    def __repr__(self):
+        return 'BlobReference("{}")'.format(self._name)
+
     def __add__(self, other):
         if not isinstance(other, basestring):
             raise RuntimeError('Cannot add BlobReference to a non-string.')
@@ -492,10 +495,17 @@
             if (len(input_usage) <= 1 or fwd_op_idx != input_usage[0]):
                 # We do not need to do gradient accumulation yet.
                 continue
-
             generator = self.gradient_generators[input_name][input_version]
-            if not self._VerifyGradientGenerators(generator):
-                continue
+            try:
+                if not self._VerifyGradientGenerators(generator):
+                    continue
+            except RuntimeError as err:
+                raise RuntimeError(
+                    "Gradients for param ''{}'' failed to verity: {}".format(
+                        input_name,
+                        err
+                    )
+                )
 
             # Finally, let's create the sum operator.
             sum_op = self._MakeSumOp(input_name, input_version)
@@ -1125,6 +1135,24 @@
         return netlike
 
 
+def output_to_list(op_output):
+    """
+    Ensures that the output of an operator is a list.
+    Use when an operator has a variable number of outputs, but a list of
+    outputs is desired even when number of outputs is 1.
+
+    Args:
+        op_output: Either a BlobReference or an iterable of BlobReferences.
+
+    Returns:
+        A list of BlobReferences.
+    """
+    assert type(op_output) in (list, tuple, BlobReference)
+    return (
+        [op_output]
+        if isinstance(op_output, BlobReference) else list(op_output))
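For example (a sketch; the blob name is illustrative):

    blob = BlobReference("x")
    output_to_list(blob)          # -> [BlobReference("x")]
    output_to_list((blob, blob))  # -> [BlobReference("x"), BlobReference("x")]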
+
+
 def _add_net_to_dict(net_dict, net):
     name = get_net_name(net)
     if net in net_dict:
@@ -1152,6 +1180,9 @@
         if num_iter is not None:
             self._step.num_iter = num_iter
 
+    def Name(self):
+        return self._step.name
+
     def __str__(self):
         return self._step.name
 
@@ -1227,10 +1258,16 @@
 
 
 class Plan(object):
-    def __init__(self, name):
+    def __init__(self, name_or_step):
         self._plan = caffe2_pb2.PlanDef()
-        self._plan.name = name
         self._net_dict = OrderedDict()
+        if isinstance(name_or_step, ExecutionStep):
+            self._plan.name = name_or_step.Name()
+            self.AddStep(name_or_step)
+        elif isinstance(name_or_step, basestring):
+            self._plan.name = name_or_step
+        else:
+            raise ValueError('name_or_step must be a string or ExecutionStep')
 
     def __str__(self):
         return self._plan.name
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
new file mode 100644
index 0000000..d094840
--- /dev/null
+++ b/caffe2/python/data_parallel_model.py
@@ -0,0 +1,407 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from types import FunctionType
+from functools import wraps
+import six
+
+from caffe2.python import cnn, dyndep, scope, workspace, core
+from caffe2.proto import caffe2_pb2
+
+dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
+
+
+DATAPARALLEL_OPS = [
+    "Conv",
+    "ConvTranspose",
+    "GroupConv",
+    "FC",
+    "FC_Decomp",
+    "FC_Prune",
+    "FC_Sparse",
+    "LRN",
+    "Dropout",
+    "MaxPool",
+    "AveragePool",
+    "Concat",
+    "DepthConcat",
+    "Relu",
+    "Transpose",
+    "SpatialBN",
+    "Accuracy",
+    "Adam",
+    "AveragedLoss",
+    "Cast",
+    "LabelCrossEntropy",
+    "LearningRate",
+    "Print",
+    "Scale",
+    "Snapshot",
+    "Softmax",
+    "StopGradient",
+    "Summarize",
+    "Sum",
+    "Tanh",
+    "WeightedSum",
+    "SquaredL2Distance",
+]
+
+
+class _GPUDataParallelMetaClass(type):
+    """A meta class to patch method in order to distribute them over multiple
+    GPUs.
+    """
+    _devices = []
+
+    @staticmethod
+    def _data_parallel_wrapper(op):
+        @wraps(op)
+        def wrapped(cls, blob_in, blob_out, *args, **kwargs):
+            # Helpers to extract a device specific blob or a global blob
+            def self_or_item(d, key):
+                if isinstance(d, dict):
+                    assert key in d
+                    return d[key]
+                return d
+
+            def get_input(gpu_id):
+                if isinstance(blob_in, list):
+                    return [self_or_item(blob, gpu_id) for blob in blob_in]
+                return self_or_item(blob_in, gpu_id)
+
+            def get_output(gpu_id):
+                return self_or_item(blob_out, gpu_id)
+
+            # If we have explicit device scope, we do not parallelize
+            if cls.explicit_scope():
+                return op(
+                    cls,
+                    blob_in,
+                    blob_out,
+                    *args,
+                    **kwargs)
+
+            devices = _GPUDataParallelMetaClass._devices
+            results = {}
+            for gpu_id in devices:
+                with core.NameScope("gpu_{}".format(gpu_id)):
+                    device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                    with core.DeviceScope(device):
+                        result = op(
+                            cls,
+                            get_input(gpu_id),
+                            get_output(gpu_id),
+                            *args,
+                            **kwargs)
+                        results[gpu_id] = result
+            return results
+
+        return wrapped
+
+    def __new__(meta, classname, bases, class_dict):
+        assert len(bases) == 1, "Expects only one base class"
+        base = bases[0]
+        assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
+        new_class_dict = {}
+        for name, attr in base.__dict__.items():
+            if name not in DATAPARALLEL_OPS:
+                continue
+            attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
+            new_class_dict[name] = attr
+        for name, attr in class_dict.items():
+            if name in new_class_dict:
+                continue
+            if isinstance(attr, FunctionType):
+                if name in DATAPARALLEL_OPS:
+                    new_class_dict[name] = \
+                        _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
+                else:
+                    new_class_dict[name] = attr
+        return super(_GPUDataParallelMetaClass, meta).__new__(
+            meta, classname, bases, new_class_dict)
+
+
+@six.add_metaclass(_GPUDataParallelMetaClass)
+class GPUDataParallelModel(cnn.CNNModelHelper):
+    """A helper class that extends CNNModelHelper to support multi GPUs
+    data parallel training.
+    """
+    def __init__(self, devices, *args, **kwargs):
+        assert len(devices) >= 1, "Should have at least 1 GPU devices"
+        assert len(devices) <= workspace.NumCudaDevices(), \
+            "Requested number of devices is greater than the number of GPUs"
+        _GPUDataParallelMetaClass._devices = devices
+        self._devices = devices
+        self._explicit_scope = False
+        self._gradient_reduce_all_added = False
+        super(GPUDataParallelModel, self).__init__(*args, **kwargs)
+
+    def explicit_scope(self):
+        return self._explicit_scope
+
+    def _call(self, name, *args, **kwargs):
+        return super(GPUDataParallelModel, self).__getattr__(
+            name)(*args, **kwargs)
+
+    # TODO(denisy): try out decorators to avoid this code below
+    def Accuracy(self, *args, **kwargs):
+        return self._call("Accuracy", *args, **kwargs)
+
+    def Adam(self, *args, **kwargs):
+        return self._call("Adam", *args, **kwargs)
+
+    def AveragedLoss(self, *args, **kwargs):
+        return self._call("AveragedLoss", *args, **kwargs)
+
+    def Cast(self, *args, **kwargs):
+        return self._call("Cast", *args, **kwargs)
+
+    def LabelCrossEntropy(self, *args, **kwargs):
+        return self._call("LabelCrossEntropy", *args, **kwargs)
+
+    def LearningRate(self, *args, **kwargs):
+        return self._call("LearningRate", *args, **kwargs)
+
+    def Print(self, *args, **kwargs):
+        return self._call("Print", *args, **kwargs)
+
+    def Scale(self, *args, **kwargs):
+        return self._call("Scale", *args, **kwargs)
+
+    def Snapshot(self, *args, **kwargs):
+        return self._call("Snapshot", *args, **kwargs)
+
+    def Softmax(self, *args, **kwargs):
+        return self._call("Softmax", *args, **kwargs)
+
+    def StopGradient(self, *args, **kwargs):
+        return self._call("StopGradient", *args, **kwargs)
+
+    def Sum(self, *args, **kwargs):
+        return self._call("Sum", *args, **kwargs)
+
+    def Summarize(self, *args, **kwargs):
+        return self._call("Summarize", *args, **kwargs)
+
+    def Tanh(self, *args, **kwargs):
+        return self._call("Tanh", *args, **kwargs)
+
+    def WeightedSum(self, *args, **kwargs):
+        return self._call("WeightedSum", *args, **kwargs)
+
+    def SquaredL2Distance(self, *args, **kwargs):
+        return self._call("SquaredL2Distance", *args, **kwargs)
+
+    def FinalizeSetup(self):
+        self.param_init_net.RunAllOnGPU()
+        self.RunAllOnGPU()
+
+        # Setup sync of initial params
+        self._SyncInitialParams()
+
+    def AddGradientOperators(self, params, *args, **kwargs):
+        def create_grad(param):
+            return self.ConstantFill(param, str(param) + "_grad", value=1.0)
+
+        param_grad = {}
+        # Explicitly need to create gradients on each GPU
+        for param in params:
+            if not isinstance(param, dict):
+                grad = create_grad(param)
+                param_grad[str(param)] = str(grad)
+            else:
+                for gpu_id in self._devices:
+                    device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                    with core.DeviceScope(device):
+                        assert gpu_id in param
+                        p = param[gpu_id]
+                        g = create_grad(p)
+                        param_grad[str(p)] = str(g)
+
+        return super(GPUDataParallelModel, self).AddGradientOperators(
+            param_grad, *args, **kwargs)
+
+    def AddWeightDecay(self, weight_decay):
+        if weight_decay == 0.0:
+            return
+
+        assert(weight_decay > 0.0)
+
+        self._explicit_scope = True
+        assert \
+            self._gradient_reduce_all_added, \
+            "Weight decay must be done after gradient sync between gpus"
+
+        for gpu_id in self._devices:
+            with core.NameScope("gpu_{}".format(gpu_id)):
+                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                with core.DeviceScope(device):
+                    wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
+                                                          value=weight_decay)
+                    ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
+                                                           value=1.0)
+                    # Only update parameters that belong to the current GPU
+                    params = self._CurrentScopeParams()
+
+                    # Take only params that are weights
+                    print("Adding weigth-decay for gpu {}.".format(gpu_id))
+
+                    gpu_weights = [p for p in params if p in self.weights]
+                    for w in gpu_weights:
+                        # Equivalent to: grad += weight_decay * w
+                        grad = self.param_to_grad[w]
+                        self.net.WeightedSum([grad, ONE, w, wd], grad)
+
+        self._explicit_scope = False
+
+    def _SyncInitialParams(self):
+        # TODO(akyrola): replace with NCCLBroadcast when it's working
+        # This doesn't work right now:
+        # with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        #     workspace.RunOperatorOnce(
+        #         core.CreateOperator(
+        #             'NCCLBroadcast', model.params, model.params, root=0))
+        unique_param_names = set(
+            stripParamName(p)
+            for p in self.params
+        )
+
+        self._explicit_scope = True
+        # Copy params from gpu_0 to other
+        for param in unique_param_names:
+            for gpu_idx in self._devices[1:]:
+                device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
+                with core.DeviceScope(device_opt):
+                    self.param_init_net.Copy(
+                        "gpu_{}/{}".format(self._devices[0], param),
+                        "gpu_{}/{}".format(gpu_idx, param)
+                    )
+        self._explicit_scope = False
+
+    def _AllReduceGradients(self):
+        """Performs NCCL AllReduce to distribute gradients to all the GPUs."""
+
+        self._gradient_reduce_all_added = True
+
+        if len(self._devices) == 1:
+            return
+
+        # Take only params that have gradient associated with them.
+        unique_grads_names = set(
+            stripParamName(grad)
+            for grad in self.param_to_grad.values()
+        )
+        # Now we need to Allreduce gradients on all the GPUs.
+        # Pick GPU #0 as a master GPU.
+        self._explicit_scope = True
+        with core.DeviceScope(
+            core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
+        ):
+            # Group by grads for reduce.
+            for grad_name in unique_grads_names:
+                grads_group = [
+                    grad
+                    for grad in self.param_to_grad.values()
+                    if stripParamName(grad) == grad_name
+                ]
+                assert len(grads_group) == len(self._devices), \
+                    "Each GPU from {}, should have a copy of {}.".format(
+                        self._devices, grad_name)
+                self.NCCLAllreduce(grads_group, grads_group)
+        self._explicit_scope = False
+
+    def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
+        """A helper to create learning rate."""
+        ITER = self.Iter("ITER")
+        # There is one interesting thing here: since we are minimizing, we are
+        # doing "descent" so the learning rate is set to be negative.
+        LR = self.net.LearningRate(
+            [ITER],
+            "LR",
+            base_lr=base_lr,
+            policy=policy,
+            **other_lr_params
+        )
+        return LR
+
+    def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
+        """A helper to construct gradient update for SGD."""
+        base_lr = base_lr / len(self._devices)
+        LR = self._BuildLR(base_lr, policy, **other_lr_params)
+        ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+        for param in params:
+            grad = self.param_to_grad[param]
+            if isinstance(grad, core.GradientSlice):
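+                # Sparse update: scatter LR * grad.values into the rows of
+                # param selected by grad.indices.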
+                self.ScatterWeightedSum(
+                    [param, ONE, grad.indices, grad.values, LR], param
+                )
+            else:
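+                # Dense update: param = 1.0 * param + LR * grad; LR is
+                # negative (see _BuildLR), so this is gradient descent.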
+                self.WeightedSum([param, ONE, grad, LR], param)
+
+    def _CurrentScopeParams(self):
+        return [
+            param
+            for param in self.param_to_grad.keys()
+            if str(param).startswith(scope.NAMESCOPE)
+        ]
+
+    def SGD(self, base_lr, policy="fixed", **other_lr_params):
+        """Adds SGD optimizer to the model."""
+        self._AllReduceGradients()
+
+        # Create update params operators.
+        self._explicit_scope = True
+        for gpu_id in self._devices:
+            with core.NameScope("gpu_{}".format(gpu_id)):
+                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                with core.DeviceScope(device):
+                    # Only update parameters that belong to the current GPU
+                    params = self._CurrentScopeParams()
+
+                    # Add optimizer update operators
+                    self._BuildSGD(params, base_lr, policy, **other_lr_params)
+        self._explicit_scope = False
+
+    def CustomSGD(
+        self,
+        paramup_build_fn,
+        base_lr,
+        lr_policy,
+        weight_decay,
+        **other_lr_pars
+    ):
+        """Custom parameter update function"""
+        self._AllReduceGradients()
+
+        self.AddWeightDecay(weight_decay)
+
+        # Run parameter update on each machine
+        self._explicit_scope = True
+        for gpu_id in self._devices:
+            with core.NameScope("gpu_{}".format(gpu_id)):
+                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                with core.DeviceScope(device):
+                    LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
+
+                    params = self._CurrentScopeParams()
+                    paramup_build_fn(self, params, LR)
+        self._explicit_scope = False
+
+    def ExecOnEachDevice(self, fn, *args, **kwargs):
+        self._explicit_scope = True
+        for gpu_id in self._devices:
+            with core.NameScope("gpu_{}".format(gpu_id)):
+                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                with core.DeviceScope(device):
+                    fn(self, *args, **kwargs)
+
+        self._explicit_scope = False
+
+
+# A helper function to extract a parameter's name
+def stripParamName(param):
+    # Format is "a/b/c/d" -> d
+    name = str(param)
+    sep = scope._NAMESCOPE_SEPARATOR
+    return name[name.rindex(sep) + 1:]
diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py
new file mode 100644
index 0000000..653838d
--- /dev/null
+++ b/caffe2/python/data_parallel_model_test.py
@@ -0,0 +1,60 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import unittest
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, workspace, data_parallel_model
+from caffe2.python.test_util import TestCase
+
+
+@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
+class GPUDataParallelModelTest(TestCase):
+    def test(self):
+        gpu_devices = [0, 1]  # gpu ids
+        perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32)
+        np.random.seed(123)
+        data = np.random.randint(
+            2, size=(50, perfect_model.size)
+        ).astype(np.float32)
+        label = np.dot(data, perfect_model)[:, np.newaxis]
+
+        model = data_parallel_model.GPUDataParallelModel(
+            gpu_devices, order="NHWC", name="fake")
+
+        fc = model.FC("data", "fc", perfect_model.size, 1,
+                      ("ConstantFill", {}), ("ConstantFill", {}), axis=0)
+        sq = model.SquaredL2Distance([fc, "label"], "sq")
+        loss = model.AveragedLoss(sq, "loss")
+        model.AddGradientOperators([loss])
+        model.SGD(-0.1)
+        model.RunAllOnGPU()
+
+        for gpu_id in gpu_devices:
+            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
+                workspace.FeedBlob(
+                    "gpu_{}/data".format(gpu_id), data[0])
+                workspace.FeedBlob(
+                    "gpu_{}/label".format(gpu_id), label[0])
+
+        workspace.RunNetOnce(model.param_init_net)
+        workspace.CreateNet(model.net)
+
+        for i in range(2000):
+            idx = np.random.randint(data.shape[0])
+            for gpu_id in gpu_devices:
+                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+                with core.DeviceScope(device):
+                    workspace.FeedBlob(
+                        "gpu_{}/data".format(gpu_id), data[idx])
+                    workspace.FeedBlob(
+                        "gpu_{}/label".format(gpu_id), label[idx])
+            workspace.RunNet(model.net)
+
+        for gpu_id in gpu_devices:
+            np.testing.assert_allclose(
+                perfect_model[np.newaxis, :],
+                workspace.FetchBlob("gpu_{}/fc_w".format(gpu_id)),
+                atol=1e-2)
diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py
index 6db919c..6878afc 100644
--- a/caffe2/python/dataio.py
+++ b/caffe2/python/dataio.py
@@ -36,6 +36,24 @@
         assert self._schema is not None, 'Schema not provided for this reader.'
         return self._schema
 
+    def setup_ex(self, init_net, finish_net):
+        """Nets to be executed once at startup and finish.
+           Experimental extension. Don't use yet"""
+        pass
+
+    def read_ex(self, local_init_net, local_finish_net):
+        """Experimental extension to the interface. Don't use yet"""
+        read_net = core.Net('reader_body')
+        return ([read_net], ) + self.read(read_net)
+
+    def read_record_ex(self, local_init_net, local_finish_net):
+        """Experimental extension to the interface. Don't use yet"""
+        nets, should_stop, fields = self.read_ex(
+            local_init_net, local_finish_net)
+        if self._schema:
+            fields = from_blob_list(self._schema, fields)
+        return nets, should_stop, fields
+
     """
     Reader is a abstract class to be implemented in order to provide
     operations capable of iterating through a dataset or stream of data.
@@ -151,10 +169,31 @@
             fields = fields.field_blobs()
         self.write(writer_net, fields)
 
+    def setup_ex(self, init_net, finish_net):
+        """Experimental, don't use yet"""
+        self.commit(finish_net)
+
+    def write_ex(self, fields, local_init_net, local_finish_net, stop_blob):
+        """Experimental extension to the interface. Don't use yet"""
+        write_net = core.Net('write_net')
+        self.write(write_net, fields)
+        return [write_net]
+
+    def write_record_ex(
+            self, fields, local_init_net, local_finish_net, stop_blob=None):
+        """Experimental extension to the interface. Don't use yet."""
+        if isinstance(fields, Field):
+            fields = fields.field_blobs()
+        if stop_blob is None:
+            stop_blob = local_init_net.NextName("dequeue_status")
+        write_nets = self.write_ex(
+            fields, local_init_net, local_finish_net, stop_blob)
+        return (write_nets, stop_blob)
+
     def commit(self, finish_net):
         """Add operations to `finish_net` that signal end of data.
 
         This must be implemented by all Writers, but may be no-op for some
         of them.
         """
-        raise NotImplementedError('Writers must implement commit.')
+        pass
diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py
new file mode 100644
index 0000000..333ad7d
--- /dev/null
+++ b/caffe2/python/experiment_util.py
@@ -0,0 +1,52 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import datetime
+import time
+
+from collections import OrderedDict
+
+'''
+Utilities for logging experiment run stats, such as accuracy
+and loss over time for different runs. Runtime arguments are stored
+in the log.
+'''
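+#
+# A minimal usage sketch (experiment name and values are illustrative):
+#   log = ModelTrainerLog("my_experiment", {"lr": 0.1})
+#   log.log(input_count=64, batch_count=1, additional_values={"loss": 2.3})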
+
+
+class ModelTrainerLog():
+
+    def __init__(self, expname, runtime_args):
+        now = datetime.datetime.fromtimestamp(time.time())
+        self.experiment_id = now.strftime('%Y%m%d_%H%M%S')
+        self.filename = "%s_%s.log" % (expname, self.experiment_id)
+        self.logstr("# %s" % str(runtime_args))
+        self.headers = None
+        self.start_time = time.time()
+
+    def logstr(self, logline):
+        with open(self.filename, "a") as f:
+            f.write(logline + "\n")
+        print(logline)
+
+    def log(self, input_count, batch_count, additional_values):
+        logdict = OrderedDict()
+        logdict['time'] = time.time() - self.start_time
+        logdict['input_counter'] = input_count
+        logdict['batch_count'] = batch_count
+        if logdict['time'] > 0:
+            logdict['inputs_per_sec'] = input_count / logdict['time']
+        else:
+            logdict['inputs_per_sec'] = 0.0
+
+        for k in sorted(additional_values.keys()):
+            logdict[k] = additional_values[k]
+
+        # Write the headers if they are not written yet
+        if self.headers is None:
+            self.headers = list(logdict.keys())
+            self.logstr(",".join(self.headers))
+
+        self.logstr(",".join([str(v) for v in logdict.values()]))
diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py
index 6b8573d..e5c0907 100644
--- a/caffe2/python/extension_loader.py
+++ b/caffe2/python/extension_loader.py
@@ -3,15 +3,11 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 import contextlib
-import sys
-import DLFCN
-
 
 @contextlib.contextmanager
 def DlopenGuard():
-    # In python 2.7 required constants are not defined.
-    # Thus they are listed explicitly
-    flags = sys.getdlopenflags()
-    sys.setdlopenflags(DLFCN.RTLD_GLOBAL | DLFCN.RTLD_NOW)
+    # This is a stub for setting up special tricks around python extensions
+    # loading. For example, it might do
+    #   sys.setdlopenflags(DLFCN.RTLD_GLOBAL | DLFCN.RTLD_NOW)
+    # which might be required in some setups of python
     yield
-    sys.setdlopenflags(flags)
diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py
index ffd1f9e..edf9118 100644
--- a/caffe2/python/gradient_checker.py
+++ b/caffe2/python/gradient_checker.py
@@ -42,7 +42,17 @@
         # Run gradient ops
         workspace.RunOperatorsOnce(grad_ops)
         # Get gradients
-        grad = workspace.FetchBlob(grad_name)
+        if isinstance(grad_name, core.GradientSlice):
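+            # Densify the sparse gradient: scatter grad_name.values into a
+            # zero tensor shaped like x at the rows given by grad_name.indices.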
+            workspace.FeedBlob('zeros', np.zeros_like(x, dtype=np.float32))
+            workspace.FeedBlob('one', np.ones(1, dtype=np.float32))
+            sparse_to_dense_op = core.CreateOperator(
+                'ScatterWeightedSum',
+                ['zeros', 'one', grad_name.indices, grad_name.values, 'one'],
+                'zeros')
+            workspace.RunOperatorOnce(sparse_to_dense_op)
+            grad = workspace.FetchBlob('zeros')
+        else:
+            grad = workspace.FetchBlob(grad_name)
         return loss, grad
 
     def CheckSimple(
@@ -86,10 +96,6 @@
             grad_ops, g_input = core.GradientRegistry.GetGradientForOp(
                 op, [s + '_grad' for s in op.output])
 
-        # sanity check: we only support dense gradient checking in this checker
-        assert all(type(g) is not core.GradientSlice for g in g_input), \
-               "This checker does not support sparse gradient yet."""
-
         dims_to_check = inputs[input_to_check].size
         # First, feed in the input.
         for i, arr in enumerate(inputs):
diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py
index e0df47c..5cb2e7d 100644
--- a/caffe2/python/hypothesis_test.py
+++ b/caffe2/python/hypothesis_test.py
@@ -46,7 +46,7 @@
     f_t = sigmoid(f_t)
     o_t = sigmoid(o_t)
     g_t = tanh(g_t)
-    valid = (seq_lengths < t).astype(np.int32)
+    valid = (t < seq_lengths).astype(np.int32)
     assert valid.shape == (N, D)
     cell_t = ((f_t * cell_t_prev) + (i_t * g_t)) * (valid) + \
         (1 - valid) * cell_t_prev
@@ -132,6 +132,7 @@
 
 
 class TestOperators(hu.HypothesisTestCase):
+
     def test_comparison_ops(self):
         ops = {"LT": lambda x1, x2: [x1 < x2],
                "LE": lambda x1, x2: [x1 <= x2],
@@ -584,8 +585,9 @@
            in_place=st.booleans(),
            lr=st.floats(min_value=0.1, max_value=0.9),
            epsilon=st.floats(min_value=1e-5, max_value=1e-2),
+           engine=st.sampled_from([None, "SIMD"]),
            **hu.gcs_cpu_only)
-    def test_adagrad_sgd(self, inputs, in_place, lr, epsilon,
+    def test_adagrad_sgd(self, inputs, in_place, lr, epsilon, engine,
                          gc, dc):
         w, grad, h = inputs
         h = np.abs(h) + 0.01
@@ -595,7 +597,7 @@
             ["w", "h", "grad", "lr"],
             ["w" if in_place else "grad_o",
              "h" if in_place else "h_o"],
-            epsilon=epsilon, device_option=gc)
+            epsilon=epsilon, engine=engine, device_option=gc)
         self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])
 
         self.assertReferenceChecks(gc, op, [w, h, grad, lr],
@@ -604,9 +606,10 @@
     @given(inputs=hu.tensors(n=3),
            lr=st.floats(min_value=0.1, max_value=0.9),
            epsilon=st.floats(min_value=1e-5, max_value=1e-2),
+           engine=st.sampled_from([None, "SIMD"]),
            **hu.gcs_cpu_only)
     def test_sparse_adagrad_sgd(self, inputs, lr, epsilon,
-                                gc, dc):
+                                engine, gc, dc):
         w, grad, h = inputs
         indices = np.arange(h.shape[0])
         indices = indices[indices % 2 == 0]
@@ -618,6 +621,7 @@
             ["param", "h", "indices", "grad", "lr"],
             ["param", "h"],
             epsilon=epsilon,
+            engine=engine,
             device_option=gc)
         self.assertDeviceChecks(
             dc, op, [w, h, indices, grad, lr], [0])
@@ -876,12 +880,12 @@
             sids = []
             for i, l in enumerate(lengths):
                 sids.extend(l * [i])
-            return (np.array(sids, dtype=int), )
+            return (np.array(sids, dtype=np.int32), )
 
         self.assertReferenceChecks(
             device_option=gc,
             op=op,
-            inputs=[np.array(lengths, dtype=int)],
+            inputs=[np.array(lengths, dtype=np.int32)],
             reference=op_ref)
 
     @given(lengths=st.lists(st.integers(min_value=0, max_value=10),
@@ -903,7 +907,7 @@
         self.assertReferenceChecks(
             device_option=gc,
             op=op,
-            inputs=[np.array(lengths, dtype=int)],
+            inputs=[np.array(lengths, dtype=np.int32)],
             reference=op_ref)
 
     @given(prediction=hu.arrays(dims=[10, 3],
@@ -970,7 +974,7 @@
         def ids_to_lengths(ids):
             ids_length = len(ids)
             if ids_length == 0:
-                return (np.array([], dtype=int),)
+                return (np.array([], dtype=np.int32),)
 
             lengths = []
             # segment id starts with 0
@@ -988,14 +992,68 @@
                     tmp_length = 0
                 tmp_length += 1
             lengths.append(tmp_length)
-            return (np.array(lengths, dtype=int),)
+            return (np.array(lengths, dtype=np.int32),)
 
         self.assertReferenceChecks(
             device_option=gc,
             op=op,
-            inputs=[np.array(segment_ids, dtype=int)],
+            inputs=[np.array(segment_ids, dtype=np.int32)],
             reference=ids_to_lengths)
 
+    @given(lengths=st.lists(st.integers(min_value=1, max_value=10),
+                            min_size=0,
+                            max_size=10),
+           power=st.sampled_from([0.5, 1.0, 1.5, 2.0]),
+           **hu.gcs_cpu_only)
+    def test_segment_ids_to_lengths_weight(self, lengths, power, gc, dc):
+        op = core.CreateOperator(
+            "SegmentIdsToLengthWeights",
+            ["segment_ids"],
+            ["lengths"],
+            power=power)
+
+        def lengths_to_ids(lengths):
+            sids = []
+            for i, l in enumerate(lengths):
+                sids.extend(l * [i])
+            return sids
+
+        segment_ids = lengths_to_ids(lengths)
+
+        def ids_to_length_weights(ids):
+            ids_length = len(ids)
+            if ids_length == 0:
+                return (np.array([], dtype=float),)
+
+            lengths = []
+            # segment id starts with 0
+            prev_id = -1
+            tmp_length = 0
+            for idx in range(ids_length):
+                cur_id = ids[idx]
+                if cur_id != prev_id:
+                    if idx != 0:
+                        lengths.append(tmp_length)
+                    while prev_id + 1 != cur_id:
+                        lengths.append(0)
+                        prev_id += 1
+                    prev_id = cur_id
+                    tmp_length = 0
+                tmp_length += 1
+            lengths.append(tmp_length)
+
+            weighted_length = []
+            for l in lengths:
+                weighted_length.extend(l * [1 / pow(l, power)])
+
+            return (np.array(weighted_length, dtype=float),)
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=op,
+            inputs=[np.array(segment_ids, dtype=np.int32)],
+            reference=ids_to_length_weights)
+
     @given(input_tensor=hu.arrays(
         dims=[10], elements=st.floats(allow_nan=False,
                                       allow_infinity=False)),
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index aeb2620..2de9fc0 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -109,6 +109,8 @@
 
 
 def segment_ids(size, is_sorted):
+    if size == 0:
+        return st.just(np.empty(shape=[0], dtype=np.int32))
     if is_sorted:
         return arrays(
             [size],
@@ -122,30 +124,73 @@
             elements=st.integers(min_value=0, max_value=2 * size))
 
 
-def segmented_tensor(min_dim=1, max_dim=4, dtype=np.float32, is_sorted=True,
-                     elements=None, **kwargs):
+def lengths(size, **kwargs):
+    # First generate the number of borders between segments.
+    # Then draw the border positions and append 0 and size.
+    # Sorting and taking the diff converts them into segment lengths,
+    # some of which may be 0.
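+    # For example, with size=5 a draw of borders [2, 2] becomes
+    # [2, 2, 0, 5] -> sorted [0, 2, 2, 5] -> diff [2, 0, 3].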
+    if size == 0:
+        return st.just(np.empty(shape=[0], dtype=np.int32))
+    return st.integers(
+        min_value=0, max_value=size - 1
+    ).flatmap(lambda num_borders:
+        hypothesis.extra.numpy.arrays(
+            np.int32, num_borders, elements=st.integers(
+                min_value=0, max_value=size
+            )
+        )
+    ).map(lambda x: np.append(x, np.array([0, size], dtype=np.int32))
+    ).map(sorted).map(np.diff)
+
+
+def segmented_tensor(
+    min_dim=1,
+    max_dim=4,
+    dtype=np.float32,
+    is_sorted=True,
+    elements=None,
+    segment_generator=segment_ids,
+    allow_empty=False,
+    **kwargs
+):
+    gen_empty = st.booleans() if allow_empty else st.just(False)
     data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
+    data_dims_ = st.tuples(
+        gen_empty, data_dims_
+    ).map(lambda pair: ([0] if pair[0] else []) + pair[1])
     return data_dims_.flatmap(lambda data_dims: st.tuples(
         arrays(data_dims, dtype, elements),
-        segment_ids(data_dims[0], is_sorted=is_sorted),
+        segment_generator(data_dims[0], is_sorted=is_sorted),
     ))
 
 
+def lengths_tensor(*args, **kwargs):
+    return segmented_tensor(*args, segment_generator=lengths, **kwargs)
+
+
 def sparse_segmented_tensor(min_dim=1, max_dim=4, dtype=np.float32,
-                            is_sorted=True, elements=None, **kwargs):
+                            is_sorted=True, elements=None, allow_empty=False,
+                            segment_generator=segment_ids, **kwargs):
+    gen_empty = st.booleans() if allow_empty else st.just(False)
     data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
-    all_dims_ = data_dims_.flatmap(lambda data_dims: st.tuples(
-        st.just(data_dims),
-        st.integers(min_value=1, max_value=data_dims[0]),
-    ))
+    all_dims_ = st.tuples(gen_empty, data_dims_).flatmap(
+        lambda pair: st.tuples(
+            st.just(pair[1]),
+            (st.integers(min_value=1, max_value=pair[1][0]) if not pair[0]
+             else st.just(0)),
+        ))
     return all_dims_.flatmap(lambda dims: st.tuples(
         arrays(dims[0], dtype, elements),
         arrays(dims[1], dtype=np.int64, elements=st.integers(
             min_value=0, max_value=dims[0][0] - 1)),
-        segment_ids(dims[1], is_sorted=is_sorted),
+        segment_generator(dims[1], is_sorted=is_sorted),
     ))
 
 
+def sparse_lengths_tensor(**kwargs):
+    return sparse_segmented_tensor(segment_generator=lengths, **kwargs)
+
+
 def tensors(n, min_dim=1, max_dim=4, dtype=np.float32, elements=None, **kwargs):
     dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
     return dims_.flatmap(
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
new file mode 100644
index 0000000..26b8b97
--- /dev/null
+++ b/caffe2/python/model_helper.py
@@ -0,0 +1,130 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from caffe2.python import core
+
+import logging
+
+
+class ModelHelperBase(object):
+    """A helper model so we can write models more easily, without having to
+    manually define parameter initializations and operators separately.
+    In order to add support for specific operators, inherit from this class
+    and add corresponding methods. Methods that represent operators should
+    take care of adding their parameters to self.params.
+    """
+
+    def __init__(self, name=None, init_params=True, allow_not_known_ops=True):
+        if name is None:
+            name = "model"
+        self.net = core.Net(name)
+        self.param_init_net = core.Net(name + '_init')
+
+        self.param_to_grad = {}
+        self.params = []
+        self.gradient_ops_added = False
+        self.init_params = init_params
+        self.allow_not_known_ops = allow_not_known_ops
+
+    def Proto(self):
+        return self.net.Proto()
+
+    def InitProto(self):
+        return self.param_init_net.Proto()
+
+    def RunAllOnGPU(self, *args, **kwargs):
+        self.param_init_net.RunAllOnGPU(*args, **kwargs)
+        self.net.RunAllOnGPU(*args, **kwargs)
+
+    def CreateDB(self, blob_out, db, db_type, **kwargs):
+        dbreader = self.param_init_net.CreateDB(
+            [], blob_out, db=db, db_type=db_type, **kwargs)
+        return dbreader
+
+    def AddGradientOperators(self, *args, **kwargs):
+        if self.gradient_ops_added:
+            raise RuntimeError("You cannot run AddGradientOperators twice.")
+        self.gradient_ops_added = True
+        grad_map = self.net.AddGradientOperators(*args, **kwargs)
+        for p in self.params:
+            if str(p) in grad_map:
+                self.param_to_grad[p] = grad_map[str(p)]
+        return grad_map
+
+    def TensorProtosDBInput(
+        self, unused_blob_in, blob_out, batch_size, db, db_type, **kwargs
+    ):
+        """TensorProtosDBInput."""
+        dbreader_name = "dbreader_" + db
+        dbreader = self.param_init_net.CreateDB(
+            [], dbreader_name,
+            db=db, db_type=db_type)
+        return self.net.TensorProtosDBInput(
+            dbreader, blob_out, batch_size=batch_size)
+
+    def AddOperator(self, op_type, inputs, parameters, *args, **kwargs):
+        """
+        Adds an operator to a model. Use parameters list
+        to specify which operator inputs are model parameters to be
+        optimized.
+
+        Example of usage:
+
+        model.AddOperator(
+            "SparseLengthsSum",
+            [embedding, indices, lengths],
+            parameters=[embedding],
+        )
+
+        Here embedding is a parameter to be optimized while indices
+        and lengths are not.
+        """
+
+        extra_parameters = filter(lambda x: (x not in inputs), parameters)
+        if len(extra_parameters) > 0:
+            raise Exception("Some parameters are not inputs: {}".format(
+                map(str, extra_parameters)
+            ))
+
+        self.params.extend(parameters)
+        return self.net.__getattr__(op_type)(inputs, *args, **kwargs)
+
+    def __getattr__(self, op_type):
+        """Catch-all for all other operators, mostly those without params."""
+        if not core.IsOperator(op_type):
+            raise RuntimeError(
+                'Method ' + op_type + ' is not a registered operator.'
+            )
+        # known_working_ops are operators that do not need special care.
+        known_working_ops = [
+            "Accuracy",
+            "Adam",
+            "AveragedLoss",
+            "Cast",
+            "EnsureCPUOutput",
+            "LabelCrossEntropy",
+            "LearningRate",
+            "Print",
+            "Sigmoid",
+            "Scale",
+            "Snapshot",
+            "Softmax",
+            "StopGradient",
+            "Summarize",
+            "Tanh",
+            "WeightedSum",
+            "SquaredL2Distance",
+            "FlattenToVec",
+            "NHWC2NCHW",
+            "ScatterWeightedSum",
+            "Squeeze",
+            "NCCLAllreduce",
+            "ConstantFill",
+            "Add",
+            "DequeueBlobs",
+        ]
+        if op_type not in known_working_ops:
+            assert self.allow_not_known_ops
+            logging.warning("You are creating an op that the ModelHelperBase "
+                            "does not recognize: {}.".format(op_type))
+        return self.net.__getattr__(op_type)
diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py
new file mode 100644
index 0000000..30caa77
--- /dev/null
+++ b/caffe2/python/models/resnet.py
@@ -0,0 +1,255 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+'''
+Utility for creating ResNets
+See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015
+'''
+
+
+class ResNetBuilder():
+    '''
+    Helper class for constructing residual blocks.
+    '''
+    def __init__(self, model, prev_blob):
+        self.model = model
+        self.comp_count = 0
+        self.comp_idx = 0
+        self.prev_blob = prev_blob
+
+    def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0):
+        self.comp_idx += 1
+        self.prev_blob = self.model.Conv(
+            self.prev_blob,
+            'comp_%d_conv_%d' % (self.comp_count, self.comp_idx),
+            in_filters,
+            out_filters,
+            weight_init=("MSRAFill", {}),
+            kernel=kernel,
+            stride=stride,
+            pad=pad
+        )
+        return self.prev_blob
+
+    def add_relu(self):
+        self.prev_blob = self.model.Relu(
+            self.prev_blob,
+            'comp_%d_relu_%d' % (self.comp_count, self.comp_idx)
+        )
+        return self.prev_blob
+
+    def add_spatial_bn(self, num_filters):
+        self.prev_blob = self.model.SpatialBN(
+            self.prev_blob,
+            'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx),
+            num_filters,
+            epsilon=1e-3
+        )
+        return self.prev_blob
+
+    '''
+    Add a "bottleneck" component as decribed in He et. al. Figure 3 (right)
+    '''
+    def add_bottleneck(
+        self,
+        input_filters,   # num of feature maps from preceding layer
+        base_filters,    # num of filters internally in the component
+        output_filters,  # num of feature maps to output
+        down_sampling=False,
+        spatial_batch_norm=True,
+    ):
+        self.comp_idx = 0
+        shortcut_blob = self.prev_blob
+
+        # 1x1
+        self.add_conv(
+            input_filters,
+            base_filters,
+            kernel=1,
+            stride=1
+        )
+
+        if spatial_batch_norm:
+            self.add_spatial_bn(base_filters)
+
+        self.add_relu()
+
+        # 3x3 (note the pad, required for keeping dimensions)
+        self.add_conv(
+            base_filters,
+            base_filters,
+            kernel=3,
+            stride=(1 if down_sampling is False else 2),
+            pad=1
+        )
+
+        if spatial_batch_norm:
+            self.add_spatial_bn(base_filters)
+        self.add_relu()
+
+        # 1x1
+        last_conv = self.add_conv(base_filters, output_filters, kernel=1)
+        if spatial_batch_norm:
+            last_conv = self.add_spatial_bn(output_filters)
+
+        # Summation with input signal (shortcut)
+        # If we need to increase the number of feature maps, we need to
+        # do a projection for the shortcut
+        if (output_filters > input_filters):
+            shortcut_blob = self.model.Conv(
+                shortcut_blob,
+                'shortcut_projection_%d' % self.comp_count,
+                input_filters,
+                output_filters,
+                weight_init=("MSRAFill", {}),
+                kernel=1,
+                stride=(1 if down_sampling is False else 2)
+            )
+            if spatial_batch_norm:
+                shortcut_blob = self.model.SpatialBN(
+                    shortcut_blob,
+                    'shortcut_projection_%d_spatbn' % self.comp_count,
+                    output_filters,
+                    epsilon=1e-3,
+                )
+
+        self.prev_blob = self.model.Sum(
+            [shortcut_blob, last_conv],
+            'comp_%d_sum_%d' % (self.comp_count, self.comp_idx)
+        )
+        self.comp_idx += 1
+        self.add_relu()
+
+        # Keep track of the number of high level components of this ResNetBuilder
+        self.comp_count += 1
+
+    def add_simple_block(
+        self,
+        input_filters,
+        num_filters,
+        down_sampling=False,
+        spatial_batch_norm=True
+    ):
+        self.comp_idx = 0
+        shortcut_blob = self.prev_blob
+
+        # 3x3
+        self.add_conv(
+            input_filters,
+            num_filters,
+            kernel=3,
+            stride=(1 if down_sampling is False else 2),
+            pad=1
+        )
+
+        if spatial_batch_norm:
+            self.add_spatial_bn(num_filters)
+        self.add_relu()
+
+        last_conv = self.add_conv(num_filters, num_filters, kernel=3, pad=1)
+        if spatial_batch_norm:
+            last_conv = self.add_spatial_bn(num_filters)
+
+        # Increase of dimensions, need a projection for the shortcut
+        if (num_filters != input_filters):
+            shortcut_blob = self.model.Conv(
+                shortcut_blob,
+                'shortcut_projection_%d' % self.comp_count,
+                input_filters,
+                num_filters,
+                weight_init=("MSRAFill", {}),
+                kernel=1,
+                stride=(1 if down_sampling is False else 2),
+            )
+            if spatial_batch_norm:
+                shortcut_blob = self.model.SpatialBN(
+                    shortcut_blob,
+                    'shortcut_projection_%d_spatbn' % self.comp_count,
+                    num_filters,
+                    epsilon=1e-3
+                )
+
+        self.prev_blob = self.model.Sum(
+            [shortcut_blob, last_conv],
+            'comp_%d_sum_%d' % (self.comp_count, self.comp_idx)
+        )
+        self.comp_idx += 1
+        self.add_relu()
+
+        # Keep track of the number of high level components of this ResNetBuilder
+        self.comp_count += 1
+
+
+def create_resnet50(model, data, num_input_channels, num_labels):
+    # conv1 + maxpool
+    model.Conv(data, 'conv1', num_input_channels, 64,
+               weight_init=("MSRAFill", {}), kernel=7, stride=2, pad=3)
+    model.SpatialBN('conv1', 'conv1_spatbn', 64, epsilon=1e-3)
+    model.Relu('conv1_spatbn', 'relu1')
+    model.MaxPool('relu1', 'pool1', kernel=3, stride=2)
+
+    # Residual blocks...
+    builder = ResNetBuilder(model, 'pool1')
+
+    # conv2_x (ref Table 1 in He et al. (2015))
+    builder.add_bottleneck(64, 64, 256)
+    builder.add_bottleneck(256, 64, 256)
+    builder.add_bottleneck(256, 64, 256)
+
+    # conv3_x
+    builder.add_bottleneck(256, 128, 512, down_sampling=True)
+    for i in range(1, 4):
+        builder.add_bottleneck(512, 128, 512)
+
+    # conv4_x
+    builder.add_bottleneck(512, 256, 1024, down_sampling=True)
+    for i in range(1, 6):
+        builder.add_bottleneck(1024, 256, 1024)
+
+    # conv5_x
+    builder.add_bottleneck(1024, 512, 2048, down_sampling=True)
+    builder.add_bottleneck(2048, 512, 2048)
+    builder.add_bottleneck(2048, 512, 2048)
+
+    # Final layers
+    model.AveragePool(builder.prev_blob, 'final_avg', kernel=7, stride=1)
+
+    # Final dimension of the "image" is reduced to 7x7
+    model.FC('final_avg', 'pred', 2048, num_labels)
+
+    softmax = model.Softmax('pred', 'softmax')
+    return softmax
+
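+# A minimal usage sketch (hypothetical names; assumes `model` is a
+# CNNModelHelper-style helper whose 'data' blob holds 224x224 images):
+#   softmax = create_resnet50(model, 'data', num_input_channels=3,
+#                             num_labels=1000)
+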
+
+def create_resnet_32x32(
+    model, data, num_input_channels, num_groups, num_labels
+):
+    '''
+    Create residual net for smaller images (sec 4.2 of He et al. (2015))
+    num_groups = 'n' in the paper
+    '''
+    # conv1
+    model.Conv(data, 'conv1', num_input_channels, 16, kernel=3, stride=1)
+    model.SpatialBN('conv1', 'conv1_spatbn', 16, epsilon=1e-3)
+    model.Relu('conv1_spatbn', 'relu1')
+
+    # Number of blocks as described in sec 4.2
+    filters = [16, 32, 64]
+
+    builder = ResNetBuilder(model, 'relu1')
+    prev_filters = 16
+    for groupidx in range(0, 3):
+        for blockidx in range(0, 2 * num_groups):
+            builder.add_simple_block(
+                prev_filters if blockidx == 0 else filters[groupidx],
+                filters[groupidx],
+                down_sampling=(True if blockidx == 0 and
+                               groupidx > 0 else False))
+        prev_filters = filters[groupidx]
+
+    # Final layers
+    model.AveragePool(builder.prev_blob, 'final_avg', kernel=8, stride=1)
+    model.FC('final_avg', 'pred', 64, num_labels)
+    softmax = model.Softmax('pred', 'softmax')
+    return softmax
diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py
index 837fe36..ec7b383 100644
--- a/caffe2/python/muji.py
+++ b/caffe2/python/muji.py
@@ -22,6 +22,12 @@
     return device_option
 
 
+def OnCPU():
+    device_option = caffe2_pb2.DeviceOption()
+    device_option.device_type = caffe2_pb2.CPU
+    return device_option
+
+
 def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
     """The general Allreduce interface that reroutes the function calls.
   """
diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py
index 81f58ce..9645d85 100644
--- a/caffe2/python/net_drawer.py
+++ b/caffe2/python/net_drawer.py
@@ -59,18 +59,33 @@
     return json.dumps(name)
 
 
-def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
+def GetOpNodeProducer(append_output, **kwargs):
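+    # Returns a callable that builds the pydot.Node for an operator; when
+    # append_output is set, the node label also lists the op's output blobs.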
+    def ReallyGetOpNode(op, op_id):
+        if op.name:
+            node_name = '%s/%s (op#%d)' % (op.name, op.type, op_id)
+        else:
+            node_name = '%s (op#%d)' % (op.type, op_id)
+        if append_output:
+            for output_name in op.output:
+                node_name += '\n' + output_name
+        return pydot.Node(node_name, **kwargs)
+    return ReallyGetOpNode
+
+
+def GetPydotGraph(
+    operators_or_net,
+    name=None,
+    rankdir='LR',
+    node_producer=None
+):
+    if node_producer is None:
+        node_producer = GetOpNodeProducer(False, **OP_STYLE)
     operators, name = _rectify_operator_and_name(operators_or_net, name)
     graph = pydot.Dot(name, rankdir=rankdir)
     pydot_nodes = {}
     pydot_node_counts = defaultdict(int)
     for op_id, op in enumerate(operators):
-        if op.name:
-            op_node = pydot.Node(
-                '%s/%s (op#%d)' % (op.name, op.type, op_id), **OP_STYLE
-            )
-        else:
-            op_node = pydot.Node('%s (op#%d)' % (op.type, op_id), **OP_STYLE)
+        op_node = node_producer(op, op_id)
         graph.add_node(op_node)
         # print 'Op: %s' % op.name
         # print 'inputs: %s' % str(op.input)
@@ -104,9 +119,10 @@
 
 def GetPydotGraphMinimal(
     operators_or_net,
-    name,
+    name=None,
     rankdir='LR',
-    minimal_dependency=False
+    minimal_dependency=False,
+    node_producer=None,
 ):
     """Different from GetPydotGraph, hide all blob nodes and only show op nodes.
 
@@ -115,6 +131,8 @@
     op a and b, and op b depends on a, then only the edge b->c will be drawn
     because a->c will be implied.
     """
+    if node_producer is None:
+        node_producer = GetOpNodeProducer(False, **OP_STYLE)
     operators, name = _rectify_operator_and_name(operators_or_net, name)
     graph = pydot.Dot(name, rankdir=rankdir)
     # blob_parents maps each blob name to its generating op.
@@ -122,12 +140,7 @@
     # op_ancestry records the ancestors of each op.
     op_ancestry = defaultdict(set)
     for op_id, op in enumerate(operators):
-        if op.name:
-            op_node = pydot.Node(
-                '%s/%s (op#%d)' % (op.name, op.type, op_id), **OP_STYLE
-            )
-        else:
-            op_node = pydot.Node('%s (op#%d)' % (op.type, op_id), **OP_STYLE)
+        op_node = node_producer(op, op_id)
         graph.add_node(op_node)
         # Get parents, and set up op ancestry.
         parents = [
@@ -175,7 +188,7 @@
     return nodes
 
 
-def _draw_steps(steps, g, skip_step_edges=False):
+def _draw_steps(steps, g, skip_step_edges=False):  # noqa
     kMaxParallelSteps = 3
 
     def get_label():
@@ -253,6 +266,9 @@
         help="If set, only draw minimal dependency."
     )
     parser.add_argument(
+        "--append_output", action="store_true",
+        help="If set, append the output blobs to the operator names.")
+    parser.add_argument(
         "--rankdir", type=str, default="LR",
         help="The rank direction of the pydot graph."
     )
@@ -268,13 +284,17 @@
     for key, operators in graphs.items():
         if args.minimal:
             graph = GetPydotGraphMinimal(
-                operators, key,
+                operators,
+                name=key,
                 rankdir=args.rankdir,
+                node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE),
                 minimal_dependency=args.minimal_dependency)
         else:
             graph = GetPydotGraph(
-                operators, key,
-                rankdir=args.rankdir)
+                operators,
+                name=key,
+                rankdir=args.rankdir,
+                node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE))
         filename = args.output_prefix + graph.get_name() + '.dot'
         graph.write(filename, format='raw')
         pdf_filename = filename[:-3] + 'pdf'
diff --git a/caffe2/python/op/python_op.cpp b/caffe2/python/op/python_op.cpp
index 0d76cb9..0396ee7 100644
--- a/caffe2/python/op/python_op.cpp
+++ b/caffe2/python/op/python_op.cpp
@@ -64,7 +64,29 @@
       try {
         pyFunc(inputs, outputs);
       } catch (const py::error_already_set& e) {
-        LOG(ERROR) << "Python exception: " << e.what();
+        LOG(ERROR) << "Exception encountered running PythonOp function: "
+                   << e.what() << "\nTraceback: ";
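+        // Fetch the pending Python exception and walk its traceback, logging
+        // file, line and function for each frame (innermost frame first).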
+        PyObject *type = nullptr, *value = nullptr, *trace = nullptr;
+        PyErr_Fetch(&type, &value, &trace);
+        PyTracebackObject* traceback =
+            reinterpret_cast<PyTracebackObject*>(trace);
+        vector<PyTracebackObject*> trace_vec;
+        while (traceback) {
+          trace_vec.push_back(traceback);
+          traceback = traceback->tb_next;
+        }
+        for (int i = trace_vec.size() - 1; i >= 0; --i) {
+          int line = trace_vec[i]->tb_lineno;
+          const char* filename =
+              PyString_AsString(trace_vec[i]->tb_frame->f_code->co_filename);
+          const char* funcname =
+              PyString_AsString(trace_vec[i]->tb_frame->f_code->co_name);
+          LOG(ERROR) << "    # " << trace_vec.size() - i - 1 << "  " << filename
+                     << " (" << line << "): " << funcname;
+        }
+        Py_XDECREF(type);
+        Py_XDECREF(value);
+        Py_XDECREF(trace);
         return false;
       }
     }
diff --git a/caffe2/python/op/python_test.py b/caffe2/python/op/python_test.py
index 9eed723..b3fa04d 100644
--- a/caffe2/python/op/python_test.py
+++ b/caffe2/python/op/python_test.py
@@ -10,6 +10,14 @@
 import numpy as np
 
 
+def SubFunctionThatThrowsRuntimeError():
+    raise RuntimeError("This is an intentional exception.")
+
+
+def MainOpFunctionThatThrowsRuntimeError(inputs, _):
+    return SubFunctionThatThrowsRuntimeError()
+
+
 class PythonOpTest(hu.HypothesisTestCase):
     @given(x=hu.tensor())
     def test_feed(self, x):
@@ -22,6 +30,11 @@
         workspace.FeedBlob("x", x)
         workspace.RunOperatorOnce(op)
 
+    def test_exception(self):
+        op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], [])
+        with self.assertRaises(RuntimeError):
+            workspace.RunOperatorOnce(op)
+
     @given(x=hu.tensor())
     def test_feed_with_helper_function(self, x):
         def f(inputs, _):
@@ -65,7 +78,7 @@
         def f(inputs, outputs):
             try:
                 raise Exception("Exception in handler")
-            except:
+            except Exception:
                 pass
 
         op = CreatePythonOperator(f, ["x"], ["y"])
diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py
index 5f7ed43..77a3981 100644
--- a/caffe2/python/operator_test/conv_test.py
+++ b/caffe2/python/operator_test/conv_test.py
@@ -26,6 +26,7 @@
            batch_size=st.integers(1, 3),
            order=st.sampled_from(["NCHW", "NHWC"]),
            engine=st.sampled_from(["", "EIGEN"]),
+           shared_buffer=st.booleans(),
            **hu.gcs)
     @settings(max_examples=2, timeout=100)
     def test_convolution_separate_stride_pad_gradients(self, stride_h, stride_w,
@@ -34,7 +35,8 @@
                                                        input_channels,
                                                        output_channels,
                                                        batch_size, order,
-                                                       engine, gc, dc):
+                                                       engine, shared_buffer,
+                                                       gc, dc):
         op = core.CreateOperator(
             "Conv",
             ["X", "w", "b"],
@@ -48,6 +50,7 @@
             kernel=kernel,
             order=order,
             engine=engine,
+            shared_buffer=int(shared_buffer),
         )
         X = np.random.rand(
             batch_size, size, size, input_channels).astype(np.float32) - 0.5
diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py
index 1669947..b7b008f 100644
--- a/caffe2/python/operator_test/conv_transpose_test.py
+++ b/caffe2/python/operator_test/conv_transpose_test.py
@@ -19,11 +19,13 @@
            input_channels=st.integers(1, 8),
            output_channels=st.integers(1, 8),
            batch_size=st.integers(1, 3),
-           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
+           engine=st.sampled_from(["", "CUDNN"]),
+           shared_buffer=st.booleans(),
+           **hu.gcs)
     def test_convolution_transpose_layout(self, stride, pad, kernel, adj,
                                           size, input_channels,
                                           output_channels, batch_size,
-                                          engine, gc, dc):
+                                          engine, shared_buffer, gc, dc):
         assume(adj < stride)
         X = np.random.rand(
             batch_size, size, size, input_channels).astype(np.float32) - 0.5
@@ -43,6 +45,7 @@
                 adj=adj,
                 order=order,
                 engine=engine,
+                shared_buffer=int(shared_buffer),
                 device_option=gc,
             )
             if order == "NCHW":
diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py
index 69d7420..83d6128 100644
--- a/caffe2/python/operator_test/dataset_ops_test.py
+++ b/caffe2/python/operator_test/dataset_ops_test.py
@@ -314,3 +314,63 @@
             workspace.RunNet(str(read_next_net))
             actual = FetchRecord(batch)
             _assert_records_equal(actual, entry)
+
+    def test_collect_tensor_ops(self):
+        init_net = core.Net('init_net')
+        blobs = ['blob_1', 'blob_2', 'blob_3']
+        bvec_map = {}
+        ONE = init_net.ConstantFill([], 'ONE', shape=[1, 2], value=1)
+        for b in blobs:
+            init_net.ConstantFill([], [b], shape=[1, 2], value=0)
+            bvec_map[b] = b + '_vec'
+            init_net.CreateTensorVector([], [bvec_map[b]])
+
+        reader_net = core.Net('reader_net')
+        for b in blobs:
+            reader_net.Add([b, ONE], [b])
+
+        collect_net = core.Net('collect_net')
+        num_to_collect = 1000
+        max_example_to_cover = 100000
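+        # CollectTensor keeps at most num_to_collect rows per vector; the
+        # histogram check below expects the kept rows to be sampled roughly
+        # uniformly from the max_example_to_cover examples seen.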
+        for i, b in enumerate(blobs):
+            if i == 0:
+                bvec_map[b], position = collect_net.CollectTensor(
+                    [bvec_map[b], b], [bvec_map[b], 'position'],
+                    num_to_collect=num_to_collect)
+            else:
+                # sample in the same way as the first blob
+                bvec_map[b], position = collect_net.CollectTensor(
+                    [bvec_map[b], b, position], [bvec_map[b], position],
+                    num_to_collect=num_to_collect)
+
+        print('Collect Net Proto: {}'.format(collect_net.Proto()))
+
+        plan = core.Plan('collect_data')
+        plan.AddStep(core.execution_step('collect_init', init_net))
+        plan.AddStep(core.execution_step('collect_data',
+                                         [reader_net, collect_net],
+                                         num_iter=max_example_to_cover))
+        workspace.RunPlan(plan)
+
+        # concat the collected tensors
+        concat_net = core.Net('concat_net')
+        bconcated_map = {}
+        for b in blobs:
+            bconcated_map[b] = b + '_concated'
+            concat_net.ConcatTensorVector([bvec_map[b]], [bconcated_map[b]])
+
+        workspace.RunNetOnce(concat_net)
+
+        # check data
+        reference_result = workspace.FetchBlob(bconcated_map[blobs[0]])
+        self.assertEqual(reference_result.shape,
+                         (min(num_to_collect, max_example_to_cover), 2))
+
+        hist, _ = np.histogram(reference_result[:, 0], bins=10,
+                               range=(1, max_example_to_cover))
+        print('Sample histogram: {}'.format(hist))
+
+        self.assertTrue(all(hist > 0.7 * (num_to_collect / 10)))
+        for i in range(1, len(blobs)):
+            result = workspace.FetchBlob(bconcated_map[blobs[i]])
+            self.assertEqual(reference_result.tolist(), result.tolist())
diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py
new file mode 100644
index 0000000..7183bcf
--- /dev/null
+++ b/caffe2/python/operator_test/gather_ranges_op_test.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+from hypothesis import given
+from hypothesis import strategies as st
+
+import caffe2.python.hypothesis_test_util as hu
+import numpy as np
+
+
+def batched_borders_and_data(
+        data_min_size=5, data_max_size=10,
+        examples_min_number=1, examples_max_number=4,
+        example_min_size=1, example_max_size=3,
+        dtype=np.float32, elements=None):
+    dims_ = st.tuples(
+        st.integers(min_value=data_min_size,
+                    max_value=data_max_size),
+        st.integers(min_value=examples_min_number,
+                    max_value=examples_max_number),
+        st.integers(min_value=example_min_size,
+                    max_value=example_max_size),
+    )
+    return dims_.flatmap(
+        lambda dims: st.tuples(
+            hu.arrays(
+                [dims[1], dims[2], 2], dtype=np.int32,
+                elements=st.integers(min_value=0, max_value=dims[0])
+            ),
+            hu.arrays([dims[0]], dtype, elements)
+    ))
+
+
+def gather_ranges(data, ranges):
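+    # Reference implementation: for every example, concatenate the slices
+    # data[start:start + length] for each (start, length) pair and record
+    # the total number of gathered elements per example.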
+    lengths = []
+    output = []
+    for example_ranges in ranges:
+        length = 0
+        for rng in example_ranges:
+            assert len(rng) == 2
+            output.extend(data[rng[0]:rng[0] + rng[1]])
+            length += rng[1]
+        lengths.append(length)
+    return output, lengths
+
+
+class TestGatherRanges(hu.HypothesisTestCase):
+    @given(borders_and_data=batched_borders_and_data(), **hu.gcs_cpu_only)
+    def test_gather_ranges(self, borders_and_data, gc, dc):
+        borders, data = borders_and_data
+
+        def borders_to_range(borders):
+            assert len(borders) == 2
+            borders = sorted(borders)
+            return [borders[0], borders[1] - borders[0]]
+
+        ranges = np.apply_along_axis(borders_to_range, 2, borders)
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=core.CreateOperator("GatherRanges",
+                                   ["data", "ranges"],
+                                   ["output", "lengths"]),
+            inputs=[data, ranges],
+            reference=gather_ranges,
+        )
diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py
index 5d27339..b656d21 100644
--- a/caffe2/python/operator_test/matmul_op_test.py
+++ b/caffe2/python/operator_test/matmul_op_test.py
@@ -46,3 +46,42 @@
         self.assertGradientChecks(gc, op, [X, Y], 0, [0])
         # Gradient check wrt Y
         self.assertGradientChecks(gc, op, [X, Y], 1, [0])
+
+
+class TestBatchMatMul(hu.HypothesisTestCase):
+    @given(C=st.integers(min_value=1, max_value=10),
+           M=st.integers(min_value=1, max_value=10),
+           K=st.integers(min_value=1, max_value=10),
+           N=st.integers(min_value=1, max_value=10),
+           trans_a=st.booleans(),
+           trans_b=st.booleans(),
+           **hu.gcs)
+    def test_matmul(self, C, M, K, N, trans_a, trans_b, gc, dc):
+        X = np.random.randn(C, M, K).astype(np.float32)
+        if trans_a:
+            X = X.swapaxes(1, 2)
+
+        Y = np.random.randn(C, K, N).astype(np.float32)
+        if trans_b:
+            Y = Y.swapaxes(1, 2)
+
+        op = core.CreateOperator(
+            'BatchMatMul', ['X', 'Y'], 'out',
+            trans_a=trans_a, trans_b=trans_b)
+
+        def matmul_ref(X, Y, trans_a, trans_b):
+            XX = X.swapaxes(1, 2) if trans_a else X
+            YY = Y.swapaxes(1, 2) if trans_b else Y
+            output = np.zeros((C, M, N)).astype(XX.dtype)
+            for i in range(C):
+                output[i] = XX[i].dot(YY[i])
+            return (output,)
+
+        # Check against numpy reference
+        self.assertReferenceChecks(gc, op, [X, Y, trans_a, trans_b],
+                                   matmul_ref)
+        # Check over multiple devices
+        self.assertDeviceChecks(dc, op, [X, Y], [0])
+        # Gradient check wrt X
+        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
+        # Gradient check wrt Y
+        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
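
The per-example loop in matmul_ref above is just a batched matrix product over the leading dimension; as a quick numpy sanity check (np.matmul broadcasts over leading dimensions):

    import numpy as np

    C, M, K, N = 2, 3, 4, 5
    X = np.random.randn(C, M, K).astype(np.float32)
    Y = np.random.randn(C, K, N).astype(np.float32)
    loop = np.stack([X[i].dot(Y[i]) for i in range(C)])
    assert np.allclose(loop, np.matmul(X, Y))
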
diff --git a/caffe2/python/operator_test/mkl_ops_test.py b/caffe2/python/operator_test/mkl_ops_test.py
new file mode 100644
index 0000000..4d34559
--- /dev/null
+++ b/caffe2/python/operator_test/mkl_ops_test.py
@@ -0,0 +1,64 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import unittest
+import hypothesis.strategies as st
+from hypothesis import given
+import numpy as np
+from caffe2.python import core
+import caffe2.python.hypothesis_test_util as hu
+
+
+@unittest.skipIf(not core.IsOperator("PackedFC"),
+                 "PackedFC is not supported in this caffe2 build.")
+class PackedFCTest(hu.HypothesisTestCase):
+    @given(seed=st.integers(0, 65536),
+           M=st.integers(16, 32),
+           K=st.integers(128, 1024),
+           N=st.integers(128, 1024),
+           **hu.gcs_cpu_only)
+    def test_packed_fc(self, seed, M, K, N, gc, dc):
+        np.random.seed(seed)
+        X = np.random.rand(M, K).astype(np.float32) - 0.5
+        W = np.random.rand(N, K).astype(np.float32) - 0.5
+        b = np.random.rand(N).astype(np.float32) - 0.5
+
+        # If you are debugging, the following hard-coded ones might help.
+        # X = np.ones((24, 256)).astype(np.float32)
+        # W = np.ones((128, 256)).astype(np.float32)
+        # b = np.zeros(128).astype(np.float32)
+
+        def ref(X, W, b):
+            return (np.dot(X, W.T) + b,)
+
+        for name in ["FC", "PackedFC"]:
+            op = core.CreateOperator(
+                name,
+                ["X", "W", "b"],
+                ["Y"],
+            )
+            self.assertReferenceChecks(gc, op, [X, W, b], ref)
+
+    @given(axis=st.integers(min_value=1, max_value=4),
+           num_output=st.integers(min_value=4, max_value=8),
+           **hu.gcs_cpu_only)
+    def test_packed_fc_axis(self, axis, num_output, gc, dc):
+        np.random.seed(1701)
+        X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
+        K = np.prod(X.shape[axis:])
+        N = num_output
+        W = np.random.randn(N, K).astype(np.float32)
+        b = np.random.randn(N).astype(np.float32)
+
+        op = core.CreateOperator(
+            "PackedFC",
+            ["X", "W", "b"],
+            ["Y"],
+            axis=axis)
+
+        def ref(X, W, b):
+            return (np.dot(X.reshape(X.size // K, K), W.T) + b,)
+
+        self.assertReferenceChecks(gc, op, [X, W, b], ref)
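
For test_packed_fc_axis above, the reshape arithmetic is (values taken from the test's fixed input shape; the names below are just for illustration):

    import numpy as np

    X = np.zeros((1, 2, 3, 2, 1), dtype=np.float32)
    axis = 2
    K = int(np.prod(X.shape[axis:]))  # 3 * 2 * 1 = 6
    M = X.size // K                   # 1 * 2     = 2
    # FC/PackedFC treats X as an (M, K) = (2, 6) matrix, so with
    # num_output = N the output Y has shape (M, N).
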
diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py
index edcd0c8..96bc668 100644
--- a/caffe2/python/operator_test/mpi_test.py
+++ b/caffe2/python/operator_test/mpi_test.py
@@ -12,10 +12,7 @@
 from caffe2.python import core, workspace, dyndep
 import caffe2.python.hypothesis_test_util as hu
 
-if workspace.has_gpu_support:
-    dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops_gpu")
-else:
-    dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops")
+dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops")
 
 try:
     from mpi4py import MPI
diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py
index a06c990..e43c3b1 100644
--- a/caffe2/python/operator_test/partition_ops_test.py
+++ b/caffe2/python/operator_test/partition_ops_test.py
@@ -8,6 +8,7 @@
 
 
 class TestPartitionOps(TestCase):
+
     def test_configs(self):
         # (main dims, partitions,  main type, [list of (extra dims, type)])
         configs = [
@@ -31,15 +32,15 @@
             for pack in [False, True]
         ]
 
-    def testSharding(self):
+    def testPartition(self):
         for main_dims, parts, main_type, extra_ins, pack in self.test_configs():
             ins = ['in' + str(i) for i in range(1 + len(extra_ins))]
             outs = [
                 'in{}_p{}'.format(i, j)
-                for i in range(1 + len(extra_ins)) for j in range(parts)
+                for i in range(parts) for j in range(1 + len(extra_ins))
             ]
             op = core.CreateOperator(
-                'Sharding', ins, outs, pack_first_input=(1 if pack else 0))
+                'Partition', ins, outs, pack_first_input=(1 if pack else 0))
             x = []
             for i, (dims, t) in enumerate([((), main_type)] + extra_ins):
                 if t in [np.float32, np.float64]:
@@ -54,21 +55,102 @@
                 # numpy has proper modulo op that yields non-negative results
                 shards = (x[0] % parts).reshape([-1])
                 out = []
-                for ind, v in enumerate(x):
-                    suffix_shape = v.shape[len(x[0].shape):]
-                    accum = [[] for i in range(parts)]
-                    a = v.reshape((-1, ) + suffix_shape)
-                    if pack and ind == 0:
-                        a //= parts
-                    for i, s in enumerate(shards):
-                        accum[s].append(a[i])
+                for i in range(parts):
+                    for ind, v in enumerate(x):
+                        suffix_shape = v.shape[len(x[0].shape):]
+                        accum = []
+                        data = v.reshape((-1, ) + suffix_shape)
 
-                    def join(a):
-                        if not a:
-                            return np.empty(shape=(0, ) + suffix_shape)
-                        return np.stack(a)
+                        if pack and ind == 0:
+                            data = data // parts
 
-                    out.extend(join(a) for a in accum)
+                        for j, s in enumerate(shards):
+                            if s == i:
+                                accum.append(data[j])
+
+                        def join(a):
+                            if not a:
+                                return np.empty(shape=(0, ) + suffix_shape)
+                            return np.stack(a)
+
+                        out.append(join(accum))
+                return out
+
+            workspace.RunOperatorOnce(op)
+            ref = sharding(x)
+            print(x)
+            print(ref)
+            for name, expected in zip(outs, ref):
+                np.testing.assert_array_equal(
+                    expected, workspace.FetchBlob(name)
+                )
+
+    def testLengthsPartition(self):
+        for main_dims, parts, main_type, extra_ins, pack in self.test_configs():
+            # For LengthsPartition, only 1-D tensors are supported as the first input
+            if len(main_dims) > 1:
+                continue
+            ins = ['in' + str(i) for i in range(2 + len(extra_ins))]
+            outs = [
+                'in{}_p{}'.format(j, i)
+                for i in range(parts) for j in range(2 + len(extra_ins))
+            ]
+            op = core.CreateOperator(
+                'LengthsPartition', ins, outs,
+                pack_first_input=(1 if pack else 0)
+            )
+            x = []
+            for i, (dims, t) in enumerate([((), main_type)] + extra_ins):
+                if t in [np.float32, np.float64]:
+                    d = rand_array(*(main_dims + dims))
+                else:
+                    d = np.random.randint(-100, 100, (main_dims + dims))
+                d = d.astype(t)
+                workspace.FeedBlob(ins[i + 1], d)
+                x.append(d)
+
+            # Randomly generate length tensor as well
+            elements = np.random.randint(2, 10)
+            lengths = []
+            total_length = 0
+            for i in range(elements - 1):
+                lengths.append(np.random.randint(main_dims[0] - total_length))
+                total_length += lengths[-1]
+            lengths.append(main_dims[0] - total_length)
+            workspace.FeedBlob(ins[0], np.array(lengths, dtype=np.int32))
+
+            def sharding(x):
+                # numpy has proper modulo op that yields non-negative results
+                shards = (x[0] % parts).reshape([-1])
+                out = []
+                for i in range(parts):
+                    idx = 0
+                    sharded_lengths = np.zeros(elements)
+                    for ind, length in enumerate(lengths):
+                        for j in range(length):
+                            if shards[idx] == i:
+                                sharded_lengths[ind] += 1
+                            idx += 1
+                    out.append(sharded_lengths)
+
+                    for ind, v in enumerate(x):
+                        suffix_shape = v.shape[len(x[0].shape):]
+                        accum = []
+                        data = v.reshape((-1, ) + suffix_shape)
+
+                        if pack and ind == 0:
+                            data = data // parts
+
+                        for j, s in enumerate(shards):
+                            if s == i:
+                                accum.append(data[j])
+
+                        def join(a):
+                            if not a:
+                                return np.empty(shape=(0, ) + suffix_shape)
+                            return np.stack(a)
+
+                        out.append(join(accum))
                 return out
 
             workspace.RunOperatorOnce(op)
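
The reordering above makes both the expected-output list and the reference partition-major: all inputs for partition 0 come first, then all inputs for partition 1, and so on. A tiny illustration of the ordering (sizes are illustrative):

    parts, num_inputs = 2, 2
    order = [(p, i) for p in range(parts) for i in range(num_inputs)]
    # order == [(0, 0), (0, 1), (1, 0), (1, 1)]
    # i.e. (partition, input) pairs in the same order as the `outs` list
    # and the reference sharding() above.
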
diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py
index bf31a66..49d8e18 100644
--- a/caffe2/python/operator_test/reshape_ops_test.py
+++ b/caffe2/python/operator_test/reshape_ops_test.py
@@ -10,10 +10,10 @@
 
 class TestLengthsToShapeOps(TestCase):
     def test_lengths_to_shape_ops(self):
-        workspace.FeedBlob('l', np.array([200, 200, 200], dtype=np.int64))
+        workspace.FeedBlob('l', np.array([200, 200, 200], dtype=np.int32))
         workspace.RunOperatorOnce(core.CreateOperator(
             'LengthsToShape', ['l'], ['s']))
-        workspace.FeedBlob('res', np.array([3, 200]))
+        workspace.FeedBlob('res', np.array([3, 200], dtype=np.int32))
         assert ((workspace.FetchBlob('s') == workspace.FetchBlob('res')).all())
 
     def test_reshape_ops(self):
diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py
index d98c0bb..2a11949 100644
--- a/caffe2/python/operator_test/segment_ops_test.py
+++ b/caffe2/python/operator_test/segment_ops_test.py
@@ -5,70 +5,164 @@
 from caffe2.python import core
 from functools import partial
 from hypothesis import given
+
 import caffe2.python.hypothesis_test_util as hu
 import numpy as np
 
 
-def split(data, segment_ids, indices=None):
-    """
-    Given:
-      data[M1 x M2 x ... x Md]
-                      the input data
-      indices[N]      the index of each entry of segment_ids into data,
-                      where 0 <= index[i] < M1,
-                      with default indices=[0,1,...N]
-      segment_ids[N]  the segment_id for each entry of indices,
+class TesterBase:
+    def segment_reduce_op(self, data, segment_ids, reducer, indices=None):
+        segments = self.split(data, segment_ids, indices)
+        output = np.zeros((len(segments), ) + data.shape[1:])
+        for i, segment in enumerate(segments):
+            output[i] = reducer(segment)
+        return output
 
-    returns K outputs, each one containing data entries corresponding
-    to one of the segments present in `segment_ids`.
-    """
-    K = max(segment_ids) + 1
-    outputs = [
-        np.zeros(
-            (np.count_nonzero(segment_ids == seg_id),) + data.shape[1:],
-            dtype=data.dtype)
-        for seg_id in range(0, K)]
-    counts = np.zeros(K)
-    for i, seg_id in enumerate(segment_ids):
-        data_idx = i if indices is None else indices[i]
-        outputs[seg_id][counts[seg_id]] = data[data_idx]
-        counts[seg_id] += 1
-    return outputs
+    def segment_reduce_grad_op(
+        self,
+        data,
+        segment_ids,
+        reducer_grad,
+        grad_out,
+        output,
+        indices=None
+    ):
+        segments = self.split(data, segment_ids, indices)
+        segment_grads = [
+            reducer_grad(grad_out[i], [output[i]], [segment])
+            for i, segment in enumerate(segments)
+        ]
+        return self.unsplit(data.shape[1:], segment_grads, segment_ids)
+
+    def test(self, prefix, input_strategy, refs):
+        tester = self
+
+        @given(X=input_strategy, **hu.gcs_cpu_only)
+        def test_segment_ops(self, X, gc, dc):
+            for op_name, ref, grad_ref in refs:
+                inputs = ['input%d' % i for i in range(0, len(X))]
+                op = core.CreateOperator(prefix + op_name, inputs, ['output'])
+
+                def seg_reduce(data, *args):
+                    indices, segments = (
+                        args if len(args) == 2 else (None, args[0])
+                    )
+                    out = tester.segment_reduce_op(
+                        data=data,
+                        segment_ids=segments,
+                        indices=indices,
+                        reducer=ref
+                    )
+                    return (out, )
+
+                def seg_reduce_grad(grad_out, outputs, inputs):
+                    data = inputs[0]
+                    args = inputs[1:]
+                    indices, segments = (
+                        args if len(args) == 2 else (None, args[0])
+                    )
+                    # grad w.r.t. data
+                    grad_val = tester.segment_reduce_grad_op(
+                        data, segments, grad_ref, grad_out, outputs[0], indices
+                    )
+                    # if sparse, include indices along with data gradient
+                    data_grad_slice = (
+                        (grad_val, indices) if indices is not None else grad_val
+                    )
+                    # other inputs don't have gradient
+                    return (data_grad_slice, ) + (None, ) * (len(inputs) - 1)
+
+                self.assertReferenceChecks(
+                    device_option=gc,
+                    op=op,
+                    inputs=X,
+                    reference=seg_reduce,
+                    output_to_grad='output',
+                    grad_reference=seg_reduce_grad,
+                )
+
+        return test_segment_ops
 
 
-def unsplit(inputs, segment_ids):
-    """ Inverse operation to `split`, with indices=None """
-    output = np.zeros((len(segment_ids),) + inputs[0].shape[1:])
-    K = max(segment_ids) + 1
-    counts = np.zeros(K)
-    for i, seg_id in enumerate(segment_ids):
-        output[i] = inputs[seg_id][counts[seg_id]]
-        counts[seg_id] += 1
-    return output
+class SegmentsTester(TesterBase):
+    def split(self, data, segment_ids, indices=None):
+        """
+        Given:
+          data[M1 x M2 x ... x Md]
+                          the input data
+          indices[N]      the index of each entry of segment_ids into data,
+                          where 0 <= index[i] < M1,
+                          with default indices=[0,1,...N]
+          segment_ids[N]  the segment_id for each entry of indices,
+
+        returns K outputs, each one containing data entries corresponding
+        to one of the segments present in `segment_ids`.
+        """
+        if segment_ids.size == 0:
+            return []
+        K = max(segment_ids) + 1
+        outputs = [
+            np.zeros(
+                (np.count_nonzero(segment_ids == seg_id), ) + data.shape[1:],
+                dtype=data.dtype
+            ) for seg_id in range(0, K)
+        ]
+        counts = np.zeros(K)
+        for i, seg_id in enumerate(segment_ids):
+            data_idx = i if indices is None else indices[i]
+            outputs[seg_id][counts[seg_id]] = data[data_idx]
+            counts[seg_id] += 1
+        return outputs
+
+    def unsplit(self, extra_shape, inputs, segment_ids):
+        """ Inverse operation to `split`, with indices=None """
+        output = np.zeros((len(segment_ids), ) + extra_shape)
+        if len(segment_ids) == 0:
+            return output
+        K = max(segment_ids) + 1
+        counts = np.zeros(K)
+        for i, seg_id in enumerate(segment_ids):
+            output[i] = inputs[seg_id][counts[seg_id]]
+            counts[seg_id] += 1
+        return output
 
 
-def segment_reduce_op(data, segment_ids, reducer, indices=None):
-    segments = split(data, segment_ids, indices)
-    output = np.zeros((len(segments),) + data.shape[1:])
-    for i, segment in enumerate(segments):
-        output[i] = reducer(segment)
-    return output
+class LengthsTester(TesterBase):
+    def split(self, data, lengths, indices=None):
+        K = len(lengths)
+        outputs = [
+            np.zeros((lengths[seg_id], ) + data.shape[1:],
+                     dtype=data.dtype) for seg_id in range(0, K)
+        ]
+        start = 0
+        for i in range(0, K):
+            for j in range(0, lengths[i]):
+                data_index = start + j
+                if indices is not None:
+                    data_index = indices[data_index]
+                outputs[i][j] = data[data_index]
+            start += lengths[i]
+        return outputs
 
-
-def segment_reduce_grad_op(data, segment_ids, reducer_grad,
-                           grad_out, output, indices=None):
-    segments = split(data, segment_ids, indices)
-    segment_grads = [
-        reducer_grad(grad_out[i], [output[i]], [segment])
-        for i, segment in enumerate(segments)]
-    return unsplit(segment_grads, segment_ids)
+    def unsplit(self, extra_shape, inputs, lengths):
+        N = sum(lengths)
+        output = np.zeros((N, ) + extra_shape)
+        K = len(lengths)
+        assert len(inputs) == K
+        current = 0
+        for i in range(0, K):
+            for j in range(0, lengths[i]):
+                output[current] = inputs[i][j]
+                current += 1
+        return output
 
 
 def sum_grad(grad_out, outputs, inputs):
     return np.repeat(
         np.expand_dims(grad_out, axis=0),
         inputs[0].shape[0],
-        axis=0)
+        axis=0
+    )
 
 
 def logsumexp(x):
@@ -80,7 +174,8 @@
     return np.repeat(
         np.expand_dims(grad_out / sum_exps, 0),
         inputs[0].shape[0],
-        axis=0) * np.exp(inputs[0])
+        axis=0
+    ) * np.exp(inputs[0])
 
 
 def logmeanexp(x):
@@ -95,10 +190,11 @@
     return np.repeat(
         np.expand_dims(grad_out / inputs[0].shape[0], 0),
         inputs[0].shape[0],
-        axis=0)
+        axis=0
+    )
 
 
-def max(x):
+def max_fwd(x):
     return np.amax(x, axis=0)
 
 
@@ -122,9 +218,7 @@
     return np.resize(flat_grad_in, inputs[0].shape)
 
 
-REFERENCES_ALL = [
-    ('Sum', partial(np.sum, axis=0), sum_grad),
-]
+REFERENCES_ALL = [('Sum', partial(np.sum, axis=0), sum_grad), ]
 
 REFERENCES_SORTED = [
     ('RangeSum', partial(np.sum, axis=0), sum_grad),
@@ -132,75 +226,75 @@
     # gradient is the same as sum
     ('RangeLogMeanExp', logmeanexp, logsumexp_grad),
     ('RangeMean', mean, mean_grad),
-    ('RangeMax', max, max_grad),
+    ('RangeMax', max_fwd, max_grad),
 ]
 
 
-def test(prefix, input_strategy, refs):
-    @given(X=input_strategy, **hu.gcs_cpu_only)
-    def test_segment_ops(self, X, gc, dc):
-        for op_name, ref, grad_ref in refs:
-            inputs = ['input%d' % i for i in range(0, len(X))]
-            op = core.CreateOperator(prefix + op_name, inputs, ['output'])
-
-            def seg_reduce(data, *args):
-                indices, segment_ids = (
-                    args if len(args) == 2 else (None, args[0]))
-                out = segment_reduce_op(
-                    data=data,
-                    segment_ids=segment_ids,
-                    indices=indices,
-                    reducer=ref)
-                return (out,)
-
-            def seg_reduce_grad(grad_out, outputs, inputs):
-                data = inputs[0]
-                args = inputs[1:]
-                indices, segment_ids = (
-                    args if len(args) == 2 else (None, args[0]))
-                # grad r.t. data
-                grad_val = segment_reduce_grad_op(
-                    data, segment_ids, grad_ref,
-                    grad_out, outputs[0], indices)
-                # if sparse, include indices along with data gradient
-                data_grad_slice = (
-                    (grad_val, indices) if indices is not None else grad_val)
-                # other inputs don't have gradient
-                return (data_grad_slice,) + (None,) * (len(inputs) - 1)
-
-            self.assertReferenceChecks(
-                device_option=gc,
-                op=op,
-                inputs=X,
-                reference=seg_reduce,
-                output_to_grad='output',
-                grad_reference=seg_reduce_grad,
-            )
-
-    return test_segment_ops
-
-
 class TestSegmentOps(hu.HypothesisTestCase):
     def test_sorted_segment_ops(self):
-        test(
+        SegmentsTester().test(
             'SortedSegment',
-            hu.segmented_tensor(dtype=np.float32, is_sorted=True),
-            REFERENCES_ALL + REFERENCES_SORTED)(self)
+            hu.segmented_tensor(
+                dtype=np.float32,
+                is_sorted=True,
+                allow_empty=True
+            ),
+            REFERENCES_ALL + REFERENCES_SORTED
+        )(self)
 
     def test_unsorted_segment_ops(self):
-        test(
+        SegmentsTester().test(
             'UnsortedSegment',
-            hu.segmented_tensor(dtype=np.float32, is_sorted=False),
-            REFERENCES_ALL)(self)
+            hu.segmented_tensor(
+                dtype=np.float32,
+                is_sorted=False,
+                allow_empty=True
+            ),
+            REFERENCES_ALL
+        )(self)
 
     def test_sparse_sorted_segment_ops(self):
-        test(
+        SegmentsTester().test(
             'SparseSortedSegment',
-            hu.sparse_segmented_tensor(dtype=np.float32, is_sorted=True),
-            REFERENCES_ALL)(self)
+            hu.sparse_segmented_tensor(
+                dtype=np.float32,
+                is_sorted=True,
+                allow_empty=True
+            ),
+            REFERENCES_ALL
+        )(self)
 
     def test_sparse_unsorted_segment_ops(self):
-        test(
+        SegmentsTester().test(
             'SparseUnsortedSegment',
-            hu.sparse_segmented_tensor(dtype=np.float32, is_sorted=False),
-            REFERENCES_ALL)(self)
+            hu.sparse_segmented_tensor(
+                dtype=np.float32,
+                is_sorted=False,
+                allow_empty=True
+            ),
+            REFERENCES_ALL
+        )(self)
+
+    def test_lengths_ops(self):
+        LengthsTester().test(
+            'Lengths',
+            hu.lengths_tensor(
+                dtype=np.float32,
+                min_value=1,
+                max_value=10,
+                allow_empty=True
+            ),
+            REFERENCES_ALL
+        )(self)
+
+    def test_sparse_lengths_ops(self):
+        LengthsTester().test(
+            'SparseLengths',
+            hu.sparse_lengths_tensor(
+                dtype=np.float32,
+                min_value=1,
+                max_value=10,
+                allow_empty=True
+            ),
+            REFERENCES_ALL
+        )(self)
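
As a concrete instance of the split/reduce reference above, SortedSegmentSum on a toy input behaves as follows (values are illustrative):

    import numpy as np

    data = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=np.float32)
    segment_ids = np.array([0, 0, 1], dtype=np.int32)  # sorted, no gaps
    # Rows sharing a segment id are reduced together:
    expected = np.array([[1. + 3., 2. + 4.],   # segment 0
                         [5., 6.]])            # segment 1
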
diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py
index 75af83d..f833ae6 100644
--- a/caffe2/python/operator_test/sequence_ops_test.py
+++ b/caffe2/python/operator_test/sequence_ops_test.py
@@ -15,7 +15,7 @@
     def gen_with_size(args):
         lengths, inner_shape = args
         data_dim = [sum(lengths)] + inner_shape
-        lengths = np.array(lengths, dtype=np.int64)
+        lengths = np.array(lengths, dtype=np.int32)
         if with_pad_data:
             return st.tuples(
                 st.just(lengths),
diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py
new file mode 100644
index 0000000..2a7036d
--- /dev/null
+++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py
@@ -0,0 +1,42 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+from scipy.sparse import coo_matrix
+
+from hypothesis import given
+import hypothesis.strategies as st
+
+from caffe2.python import core
+import caffe2.python.hypothesis_test_util as hu
+
+
+class TestSparseGradient(hu.HypothesisTestCase):
+    @given(M=st.integers(min_value=5, max_value=20),
+           N=st.integers(min_value=5, max_value=20),
+           K=st.integers(min_value=5, max_value=15),
+           sparsity=st.floats(min_value=0.1, max_value=1.0),
+           **hu.gcs)
+    def test_sparse_gradient(self, M, N, K, sparsity, gc, dc):
+        X = np.random.randn(M, K).astype(np.float32)
+        X[X > sparsity] = 0
+        X_coo = coo_matrix(X)
+        val, key, seg = X_coo.data, X_coo.col, X_coo.row
+
+        val = val.astype(np.float32)
+        key = key.astype(np.int64)
+        seg = seg.astype(np.int32)
+
+        Y = np.random.randn(K, N).astype(np.float32)
+
+        op = core.CreateOperator(
+            'SparseUnsortedSegmentWeightedSum',
+            ['Y', 'val', 'key', 'seg'],
+            ['out'],
+            num_segments=M)
+
+        # Gradient check wrt Y
+        self.assertGradientChecks(
+            gc, op, [Y, val, key, seg], 0, [0])
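
The (val, key, seg) triple fed to the operator above is simply the COO decomposition of the dense matrix: nonzero values, their column indices (which rows of Y get weighted), and their row indices (which output segment each product belongs to). A tiny sketch:

    import numpy as np
    from scipy.sparse import coo_matrix

    X = np.array([[0., 2., 0.],
                  [1., 0., 3.]], dtype=np.float32)
    X_coo = coo_matrix(X)
    # X_coo.data -> [2. 1. 3.]   (val)
    # X_coo.col  -> [1 0 2]      (key)
    # X_coo.row  -> [0 1 1]      (seg)
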
diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py
index 22548a0..1eaca27 100644
--- a/caffe2/python/operator_test/spatial_bn_op_test.py
+++ b/caffe2/python/operator_test/spatial_bn_op_test.py
@@ -93,9 +93,7 @@
            seed=st.integers(0, 65535),
            order=st.sampled_from(["NCHW", "NHWC"]),
            epsilon=st.floats(1e-5, 1e-2),
-           **hu.gcs_gpu_only)
-    @unittest.skipIf(not workspace.has_gpu_support,
-                     "SpatialBN gradient only implemented through gpu.")
+           **hu.gcs)
     def test_spatialbn_train_mode_gradient_check(
             self, size, input_channels, batch_size, seed, order, epsilon,
             gc, dc):
@@ -107,7 +105,7 @@
             is_test=False,
             epsilon=epsilon,
         )
-        np.random.seed(1701)
+        np.random.seed(seed)
         scale = np.random.rand(input_channels).astype(np.float32) + 0.5
         bias = np.random.rand(input_channels).astype(np.float32) - 0.5
         mean = np.random.randn(input_channels).astype(np.float32)
@@ -117,5 +115,6 @@
         if order == "NHWC":
             X = X.swapaxes(1, 2).swapaxes(2, 3)
 
-        self.assertGradientChecks(gc, op, [X, scale, bias, mean, var],
-                                  0, [0])
+        for input_to_check in [0, 1, 2]:  # dX, dScale, dBias
+            self.assertGradientChecks(gc, op, [X, scale, bias, mean, var],
+                                      input_to_check, [0])
diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py
new file mode 100644
index 0000000..25099a6
--- /dev/null
+++ b/caffe2/python/operator_test/square_root_divide_op_test.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+from functools import partial
+from hypothesis import given
+from hypothesis import strategies as st
+
+import caffe2.python.hypothesis_test_util as hu
+import math
+import numpy as np
+
+
+def _data_and_scale(
+        data_min_size=4, data_max_size=10,
+        examples_min_number=1, examples_max_number=4,
+        dtype=np.float32, elements=None):
+    dims_ = st.tuples(
+        st.integers(min_value=examples_min_number,
+                    max_value=examples_max_number),
+        st.integers(min_value=data_min_size,
+                    max_value=data_max_size),
+    )
+    return dims_.flatmap(
+        lambda dims: st.tuples(
+            hu.arrays([dims[0], dims[1]], dtype=dtype),
+            hu.arrays(
+                [dims[0]], np.int32,
+                st.integers(min_value=5, max_value=10),
+            )
+        )
+    )
+
+
+def divide_by_square_root(data, scale):
+    output = np.copy(data)
+    num_examples = len(scale)
+
+    assert num_examples == data.shape[0]
+    assert len(data.shape) == 2
+
+    for i in range(0, num_examples):
+        if scale[i] > 0:
+            output[i] = np.multiply(data[i], 1 / math.sqrt(scale[i]))
+
+    return (output, )
+
+
+def grad(output_grad, ref_outputs, inputs):
+    return (divide_by_square_root(output_grad, inputs[1])[0],
+            None)
+
+
+class TestSquareRootDivide(hu.HypothesisTestCase):
+    @given(data_and_scale=_data_and_scale(),
+           **hu.gcs_cpu_only)
+    def test_square_root_divide(self, data_and_scale, gc, dc):
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=core.CreateOperator("SquareRootDivide",
+                                   ["data", "scale"],
+                                   ["output"]),
+            inputs=list(data_and_scale),
+            reference=partial(divide_by_square_root),
+            output_to_grad="output",
+            grad_reference=grad,
+        )
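
A worked instance of the reference above: each row of data is multiplied by 1/sqrt(scale[i]) whenever the scale is positive (values are illustrative):

    import numpy as np

    data = np.array([[2., 4.], [3., 9.]], dtype=np.float32)
    scale = np.array([4, 9], dtype=np.int32)
    expected = data / np.sqrt(scale)[:, None]
    # row 0: [2, 4] / 2 -> [1., 2.]
    # row 1: [3, 9] / 3 -> [1., 3.]
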
diff --git a/caffe2/python/operator_test/squeeze_test.py b/caffe2/python/operator_test/squeeze_test.py
deleted file mode 100644
index c5566ac..0000000
--- a/caffe2/python/operator_test/squeeze_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-import numpy as np
-
-from caffe2.python import core, workspace
-from caffe2.python.test_util import TestCase
-
-
-class TestSqueezeOp(TestCase):
-    def test_squeeze_all(self):
-        # Testing that squeezing without dims works.
-        # With dims is covered in hypothesis_test
-        data = np.array([[[1]]], dtype=np.int32)
-        workspace.FeedBlob('data', data)
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'Squeeze', ['data'], ['squeezed']))
-        result = workspace.FetchBlob('squeezed')
-        assert(np.array_equal(result, 1))
diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py
new file mode 100644
index 0000000..1f8004a
--- /dev/null
+++ b/caffe2/python/pipeline.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core, queue_util
+from caffe2.python.dataio import Reader, Writer
+
+
+def processor_step(
+        reader, writer, num_threads=1, processor=None, name='processor'):
+    """
+    Given a reader and a writer, couple them through a processor, running
+    across multiple threads.
+
+    Args:
+        reader:      an instance of dataio.Reader
+        writer:      an instance of dataio.Writer
+        num_threads: number of processing threads
+        processor:   if provided, a function taking form:
+                     (nets, out_record) = processor(record)
+                     where `record` is a schema.Struct containing the input,
+                     `nets` is the list of nets doing the transformation, and
+                     `out_record` is a schema.Struct with transformed data;
+        name:        Name to be given to nets and execution steps created.
+
+    Returns:
+        Execution step that runs all threads of the processor in parallel.
+    """
+    assert isinstance(reader, Reader)
+    assert isinstance(writer, Writer)
+    global_init_net = core.Net(name + '_producer_global_init')
+    global_exit_net = core.Net(name + '_producer_global_exit')
+
+    reader.setup_ex(global_init_net, global_exit_net)
+    writer.setup_ex(global_init_net, global_exit_net)
+
+    def default_processor(fields):
+        return [], fields
+
+    if processor is None:
+        processor = default_processor
+
+    steps = []
+    for thread_id in range(num_threads):
+        init_net = core.Net(name + "_init_net_%d" % thread_id)
+        exit_net = core.Net(name + "_exit_net_%d" % thread_id)
+
+        read_nets, status, rec = reader.read_record_ex(init_net, exit_net)
+        process_nets, rec = processor(rec)
+        write_nets, _ = writer.write_record_ex(rec, init_net, exit_net, status)
+
+        step = core.execution_step(
+            name + "_thread_%d" % thread_id, [
+                core.execution_step(name + "_init_step", init_net),
+                core.execution_step(
+                    name + "_worker_step",
+                    list(read_nets) + list(process_nets) + list(write_nets),
+                    should_stop_blob=status
+                ), core.execution_step(name + "_exit_step", exit_net)
+            ]
+        )
+        steps.append(step)
+
+    return core.execution_step(
+        "sender_step", [
+            core.execution_step('init_step', global_init_net),
+            core.execution_step(
+                "sender_steps", steps, concurrent_substeps=True),
+            core.execution_step('finish_step', global_exit_net),
+        ]
+    )
+
+
+class LocalPipeline(object):
+    """
+    Create a data processing pipeline consisting of a sequence of
+    multi-threaded processors communicating through queues.
+    """
+    def __init__(self):
+        self.tasks = []
+        self.init_net = core.Net('worker_init')
+
+    def create_queue(self, capacity, schema):
+        """
+        Create a queue that will be used to communicate between processors.
+
+        Args:
+            capacity: max number of records in the queue
+            schema:   a schema.Struct representing the schema of a record in
+                      the queue.
+
+        Returns:
+            A QueueWrapper containing a queue.
+        """
+        return queue_util.QueueWrapper(self.init_net, capacity, schema)
+
+    def add_task(self, task):
+        """
+        Add a task to the pipeline.
+        This task will run in parallel to other tasks in the pipeline.
+        """
+        self.tasks.append(task)
+
+    def link(self, reader, writer, num_threads=1, processor=None):
+        """
+        Add a task that will read from `reader`, and write to `writer`.
+        See function `processor_step` above for description of the arguments.
+        """
+        self.add_task(processor_step(reader, writer, num_threads, processor))
+
+    def get_step(self):
+        """
+        Create and return a Caffe2 execution step that will run all the tasks
+        of this pipeline in parallel.
+        """
+        return core.execution_step('worker_step', [
+            core.execution_step('worker_init', self.init_net),
+            core.execution_step(
+                'tasks_step', self.tasks, concurrent_substeps=True)
+        ])
+
+    def get_step_and_output(self):
+        """
+        Return a tuple (execution_step, output) to be used as one of the tasks
+        in a distributed pipeline.
+        """
+        output = self.init_net.ConstantFill([], value=0.0)
+        return self.get_step(), [output]
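
A rough usage sketch of the pipeline API defined above, using only the classes added in this patch (the schema, capacities, thread count and processor are illustrative):

    from caffe2.python import schema
    from caffe2.python.pipeline import LocalPipeline

    pipeline = LocalPipeline()
    q_in = pipeline.create_queue(capacity=10, schema=schema.RawTuple(2))
    q_out = pipeline.create_queue(capacity=10, schema=schema.RawTuple(2))

    def identity_processor(record):
        # no transformation nets; pass the record through unchanged
        return [], record

    pipeline.link(q_in.reader(), q_out.writer(),
                  num_threads=2, processor=identity_processor)
    step = pipeline.get_step()
    # `step` is an ExecutionStep; with the RunPlan change further below it
    # can be passed to caffe2.python.workspace.RunPlan(step) directly
    # (something else still has to feed q_in and drain q_out).
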
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 68cb2ea..757d6bf 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -220,11 +220,19 @@
             py::gil_scoped_release g;
             CAFFE_ENFORCE(self->RunOperatorOnce(proto));
           })
-      .def("_run_plan", [](Workspace* self, py::bytes def) {
-        caffe2::PlanDef proto;
-        CAFFE_ENFORCE(proto.ParseFromString(def));
-        py::gil_scoped_release g;
-        CAFFE_ENFORCE(self->RunPlan(proto));
+      .def(
+          "_run_plan",
+          [](Workspace* self, py::bytes def) {
+            caffe2::PlanDef proto;
+            CAFFE_ENFORCE(proto.ParseFromString(def));
+            py::gil_scoped_release g;
+            CAFFE_ENFORCE(self->RunPlan(proto));
+          })
+      .def_property_readonly_static("current", [](py::object /* type */) {
+        auto ws = gWorkspaces.find(gCurrentWorkspaceName);
+        CAFFE_ENFORCE(ws != gWorkspaces.end());
+        CAFFE_ENFORCE(ws->second.get());
+        return py::cast(ws->second.get(), py::return_value_policy::reference);
       });
 
   // Gradients
@@ -369,6 +377,7 @@
       },
       "Reset the workspace",
       py::arg("root_folder") = py::none());
+
   m.def("root_folder", []() {
     CAFFE_ENFORCE(gWorkspace);
     return gWorkspace->RootFolder();
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index 8fb1e91..7f60efb 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -65,7 +65,7 @@
   pybind11::object Fetch(const Blob& blob) override {
     const Tensor<Context>& tensor = blob.Get<Tensor<Context>>();
     Context context;
-    CHECK_GE(tensor.size(), 0);
+    CAFFE_ENFORCE_GE(tensor.size(), 0, "Trying to fetch uninitialized tensor");
     std::vector<npy_intp> npy_dims;
     for (const auto dim : tensor.dims()) {
       npy_dims.push_back(dim);
diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py
index 1998f97..a703358 100644
--- a/caffe2/python/queue_util.py
+++ b/caffe2/python/queue_util.py
@@ -3,7 +3,64 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import core
+from caffe2.python import core, dataio
+
+
+class QueueReader(dataio.Reader):
+    def __init__(self, queue, num_blobs=None, schema=None):
+        dataio.Reader.__init__(self, schema)
+        assert schema is not None or num_blobs is not None, (
+            'Either schema or num_blobs must be provided.')
+
+        self.queue = queue
+        self.num_blobs = num_blobs
+
+        if schema is not None:
+            schema_num_blobs = len(schema.field_names())
+            assert num_blobs is None or num_blobs == schema_num_blobs
+            self.num_blobs = schema_num_blobs
+
+    def setup_ex(self, init_net, exit_net):
+        exit_net.CloseBlobsQueue([self.queue], 0)
+
+    def read_ex(self, local_init_net, local_finish_net):
+        dequeue_net = core.Net('dequeue_net')
+        fields, status_blob = dequeue(dequeue_net, self.queue, self.num_blobs)
+        return [dequeue_net], status_blob, fields
+
+
+class QueueWriter(dataio.Writer):
+    def __init__(self, queue):
+        self.queue = queue
+
+    def setup_ex(self, init_net, exit_net):
+        exit_net.CloseBlobsQueue([self.queue], 0)
+
+    def write_ex(self, fields, local_init_net, local_finish_net, status):
+        enqueue_net = core.Net('enqueue_net')
+        enqueue(enqueue_net, self.queue, fields, status)
+        return [enqueue_net]
+
+
+class QueueWrapper(object):
+    def __init__(self, init_net, capacity, schema):
+        self._queue = init_net.CreateBlobsQueue(
+            [],
+            capacity=capacity,
+            num_blobs=len(schema.field_names()))
+        self._schema = schema
+
+    def reader(self):
+        return QueueReader(self._queue, schema=self._schema)
+
+    def writer(self):
+        return QueueWriter(self._queue)
+
+    def queue(self):
+        return self._queue
+
+    def schema(self):
+        return self._schema
 
 
 def enqueue(net, queue, data_blobs, status=None):
diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py
index ab346a6..336e74a 100644
--- a/caffe2/python/schema.py
+++ b/caffe2/python/schema.py
@@ -37,6 +37,16 @@
         return ''
 
 
+def _normalize_field(field_or_type_or_blob):
+    """Clones/normalizes a field before adding it to a container."""
+    if isinstance(field_or_type_or_blob, Field):
+        return field_or_type_or_blob.clone()
+    elif type(field_or_type_or_blob) in (type, np.dtype):
+        return Scalar(dtype=field_or_type_or_blob)
+    else:
+        return Scalar(blob=field_or_type_or_blob)
+
+
 class Field(object):
     """Represents an abstract field type in a dataset.
     """
@@ -116,9 +126,8 @@
     the parent domain.
     """
     def __init__(self, values, lengths_blob=None):
-        assert isinstance(values, Field)
         self.lengths = Scalar(np.int32, lengths_blob)
-        self._items = values.clone()
+        self._items = _normalize_field(values)
         self.lengths._set_parent(self, 0)
         self._items._set_parent(self, 1)
         Field.__init__(self, [self.lengths, self._items])
@@ -160,8 +169,7 @@
             assert field[0], 'Field names cannot be empty'
             assert field[0] != 'lengths', (
                 'Struct cannot contain a field named `lengths`.')
-            assert isinstance(field[1], Field)
-        fields = [(name, field.clone()) for name, field in fields]
+        fields = [(name, _normalize_field(field)) for name, field in fields]
         for id, (name, field) in enumerate(fields):
             field._set_parent(self, id)
         self.fields = OrderedDict(fields)
@@ -191,6 +199,16 @@
     def clone(self):
         return Struct(*self.fields.items())
 
+    def __getitem__(self, item):
+        if isinstance(item, list) or isinstance(item, tuple):
+            return Struct(*[(
+                self.fields.keys()[k] if isinstance(k, int) else k, self[k])
+                for k in item])
+        elif isinstance(item, int):
+            return self.fields.values()[item]
+        else:
+            return self.fields[item]
+
     def __getattr__(self, item):
         if item.startswith('__'):
             raise AttributeError(item)
@@ -340,6 +358,23 @@
         lengths_blob=lengths_blob)
 
 
+def Tuple(*fields):
+    """
+    Creates a Struct with default, sequential, field names of given types.
+    """
+    return Struct(*[
+        ('field_%d' % i, field) for i, field in enumerate(fields)])
+
+
+def RawTuple(num_fields):
+    """
+    Creates a tuple of `num_fields` untyped scalars.
+    """
+    assert isinstance(num_fields, int)
+    assert num_fields > 0
+    return Tuple(*([np.void] * num_fields))
+
+
 def from_dtype(dtype, _outer_shape=()):
     """Constructs a Caffe2 schema from the given numpy's dtype.
 
diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py
index c9caf09..aea2c80 100644
--- a/caffe2/python/schema_test.py
+++ b/caffe2/python/schema_test.py
@@ -15,9 +15,69 @@
         s = schema.Struct(
             ('field1', schema.Scalar(dtype=np.int32)),
             ('field2', schema.List(
-                schema.Scalar(dtype=str))))
+                schema.Scalar(dtype=str)))
+        )
         s2 = pickle.loads(pickle.dumps(s))
         for r in (s, s2):
             self.assertTrue(isinstance(r.field1, schema.Scalar))
             self.assertTrue(isinstance(r.field2, schema.List))
             self.assertTrue(getattr(r, 'non_existent', None) is None)
+
+    def testNormalizeField(self):
+        s = schema.Struct(('field1', np.int32), ('field2', str))
+        self.assertEquals(
+            s,
+            schema.Struct(
+                ('field1', schema.Scalar(dtype=np.int32)),
+                ('field2', schema.Scalar(dtype=str))
+            )
+        )
+
+    def testTuple(self):
+        s = schema.Tuple(np.int32, str, np.float32)
+        s2 = schema.Struct(
+            ('field_0', schema.Scalar(dtype=np.int32)),
+            ('field_1', schema.Scalar(dtype=np.str)),
+            ('field_2', schema.Scalar(dtype=np.float32))
+        )
+        self.assertEquals(s, s2)
+        self.assertEquals(s[0], schema.Scalar(dtype=np.int32))
+        self.assertEquals(s[1], schema.Scalar(dtype=np.str))
+        self.assertEquals(s[2], schema.Scalar(dtype=np.float32))
+        self.assertEquals(
+            s[2, 0],
+            schema.Struct(
+                ('field_2', schema.Scalar(dtype=np.float32)),
+                ('field_0', schema.Scalar(dtype=np.int32)),
+            )
+        )
+        # test iterator behavior
+        for i, (v1, v2) in enumerate(zip(s, s2)):
+            self.assertEquals(v1, v2)
+            self.assertEquals(s[i], v1)
+            self.assertEquals(s2[i], v1)
+
+    def testRawTuple(self):
+        s = schema.RawTuple(2)
+        self.assertEquals(
+            s,
+            schema.Struct(
+                ('field_0', schema.Scalar()),
+                ('field_1', schema.Scalar())))
+        self.assertEquals(s[0], schema.Scalar())
+        self.assertEquals(s[1], schema.Scalar())
+
+    def testStructIndexing(self):
+        s = schema.Struct(
+            ('field1', schema.Scalar(dtype=np.int32)),
+            ('field2', schema.List(schema.Scalar(dtype=str)))
+        )
+        self.assertEquals(s['field2'], s.field2)
+        self.assertEquals(s['field2'], schema.List(schema.Scalar(dtype=str)))
+        self.assertEquals(
+            s['field2', 'field1'],
+            schema.Struct(
+                ('field2', schema.List(schema.Scalar(dtype=str))),
+                ('field1', schema.Scalar(dtype=np.int32)),
+            )
+        )
diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py
index 18b3b42..497507c 100644
--- a/caffe2/python/scope.py
+++ b/caffe2/python/scope.py
@@ -21,6 +21,7 @@
 _NAMESCOPE_SEPARATOR = '/'
 
 
+# NOTE: using NameScope is NOT thread-safe! (TODO t13621185)
 @contextlib.contextmanager
 def NameScope(prefix, reset=False):
     global NAMESCOPE
diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py
index f055cfd..6e91b46 100644
--- a/caffe2/python/utils.py
+++ b/caffe2/python/utils.py
@@ -42,6 +42,9 @@
 
     if isinstance(value, np.ndarray):
         value = value.flatten().tolist()
+    elif isinstance(value, np.generic):
+        # convert numpy scalar to native python type
+        value = np.asscalar(value)
 
     if type(value) is float:
         argument.f = value
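
The new np.generic branch matters because a numpy scalar is not a Python float or int, so the type(value) checks below would otherwise not match it; a quick illustration:

    import numpy as np

    value = np.float32(0.5)
    isinstance(value, float)     # False: numpy scalar, not a Python float
    native = np.asscalar(value)  # 0.5 as a plain Python float
    isinstance(native, float)    # True
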
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index 8dc1814..1ce3d625 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -108,6 +108,9 @@
             return obj.SerializeToString()
         elif hasattr(obj, 'Proto'):
             return obj.Proto().SerializeToString()
+        else:
+            raise ValueError("Unexpected argument to StringfyProto of type " +
+                             type(obj).__name__)
 
 
 def ResetWorkspace(root_folder=None):
@@ -155,8 +158,12 @@
     return C.run_net(StringifyNetName(name))
 
 
-def RunPlan(plan):
-    return C.run_plan(StringfyProto(plan))
+def RunPlan(plan_or_step):
+    # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps
+    import caffe2.python.core as core
+    if isinstance(plan_or_step, core.ExecutionStep):
+        plan_or_step = core.Plan(plan_or_step)
+    return C.run_plan(StringfyProto(plan_or_step))
 
 
 def _StringifyName(name, expected_type):
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index cd5c117..b3afb7a 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -42,6 +42,15 @@
             workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
         self.assertEqual(workspace.HasBlob("testblob"), True)
 
+    def testCurrentWorkspaceWrapper(self):
+        self.assertNotIn("testblob", workspace.C.Workspace.current.blobs)
+        self.assertEqual(
+            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
+        self.assertEqual(workspace.HasBlob("testblob"), True)
+        self.assertIn("testblob", workspace.C.Workspace.current.blobs)
+        workspace.ResetWorkspace()
+        self.assertNotIn("testblob", workspace.C.Workspace.current.blobs)
+
     def testRunPlan(self):
         plan = core.Plan("test-plan")
         plan.AddStep(core.ExecutionStep("test-step", self.net))
@@ -49,6 +58,11 @@
             workspace.RunPlan(plan.Proto().SerializeToString()), True)
         self.assertEqual(workspace.HasBlob("testblob"), True)
 
+    def testConstructPlanFromSteps(self):
+        step = core.ExecutionStep("test-step-as-plan", self.net)
+        self.assertEqual(workspace.RunPlan(step), True)
+        self.assertEqual(workspace.HasBlob("testblob"), True)
+
     def testResetWorkspace(self):
         self.assertEqual(
             workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
diff --git a/caffe2/queue/blobs_queue.h b/caffe2/queue/blobs_queue.h
index efd7f22..1bc6a07 100644
--- a/caffe2/queue/blobs_queue.h
+++ b/caffe2/queue/blobs_queue.h
@@ -74,29 +74,26 @@
     return true;
   }
 
-  bool blockingWrite(const std::vector<Blob*>& inputs) {
+  bool tryWrite(const std::vector<Blob*>& inputs) {
     auto keeper = this->shared_from_this();
     std::unique_lock<std::mutex> g(mutex_);
-    auto canWrite = [this]() {
-      // writer is always within [reader, reader + size)
-      // we can write if reader is within [reader, reader + size)
-      CHECK_LE(reader_, writer_);
-      CHECK_LE(writer_, reader_ + queue_.size());
-      return writer_ != reader_ + queue_.size();
-    };
-    cv_.wait(g, [this, canWrite]() { return closing_ || canWrite(); });
     if (!canWrite()) {
       return false;
     }
     DCHECK(canWrite());
-    auto& result = queue_[writer_ % queue_.size()];
-    CAFFE_ENFORCE(inputs.size() >= result.size());
-    for (auto i = 0; i < result.size(); ++i) {
-      using std::swap;
-      swap(*(inputs[i]), *(result[i]));
+    doWrite(inputs);
+    return true;
+  }
+
+  bool blockingWrite(const std::vector<Blob*>& inputs) {
+    auto keeper = this->shared_from_this();
+    std::unique_lock<std::mutex> g(mutex_);
+    cv_.wait(g, [this]() { return closing_ || canWrite(); });
+    if (!canWrite()) {
+      return false;
     }
-    ++writer_;
-    cv_.notify_all();
+    DCHECK(canWrite());
+    doWrite(inputs);
     return true;
   }
 
@@ -112,6 +109,25 @@
   }
 
  private:
+  bool canWrite() {
+    // writer is always within [reader, reader + size)
+    // we can write if writer is within [reader, reader + size)
+    CHECK_LE(reader_, writer_);
+    CHECK_LE(writer_, reader_ + queue_.size());
+    return writer_ != reader_ + queue_.size();
+  }
+
+  void doWrite(const std::vector<Blob*>& inputs) {
+    auto& result = queue_[writer_ % queue_.size()];
+    CAFFE_ENFORCE(inputs.size() >= result.size());
+    for (auto i = 0; i < result.size(); ++i) {
+      using std::swap;
+      swap(*(inputs[i]), *(result[i]));
+    }
+    ++writer_;
+    cv_.notify_all();
+  }
+
   std::atomic<bool> closing_{false};
 
   size_t numBlobs_;
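
For readers new to the ring-buffer bookkeeping factored into canWrite()/doWrite(): reader_ and writer_ are monotonically increasing counters with the invariant reader_ <= writer_ <= reader_ + size, and the queue is full exactly when writer_ == reader_ + size. tryWrite() returns false immediately in that case, while blockingWrite() waits on the condition variable until a read (or close) makes room. A toy model of the check (Python, purely illustrative):

    size = 4
    reader_, writer_ = 3, 7
    can_write = writer_ != reader_ + size  # False: the queue is full
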
diff --git a/caffe2/queue/queue_ops.cc b/caffe2/queue/queue_ops.cc
index d938394..8b073ed 100644
--- a/caffe2/queue/queue_ops.cc
+++ b/caffe2/queue/queue_ops.cc
@@ -3,6 +3,8 @@
 
 namespace caffe2 {
 
+CAFFE_KNOWN_TYPE(std::shared_ptr<BlobsQueue>);
+
 namespace {
 
 REGISTER_CPU_OPERATOR(CreateBlobsQueue, CreateBlobsQueueOp<CPUContext>);
diff --git a/caffe2/sgd/adagrad_op.h b/caffe2/sgd/adagrad_op.h
index d2917f4..47f5a54 100644
--- a/caffe2/sgd/adagrad_op.h
+++ b/caffe2/sgd/adagrad_op.h
@@ -19,7 +19,7 @@
   for (auto i = 0; i < N; ++i) {
     float gi = g[i];
     float hi = nh[i] = h[i] + gi * gi;
-    ng[i] = lr[0] * gi / (sqrt(hi) + epsilon);
+    ng[i] = lr[0] * gi / (std::sqrt(hi) + epsilon);
   }
 }
 
@@ -32,12 +32,12 @@
     float* nw,
     float* nh,
     float epsilon,
-    const float* lr,
+    float lr,
     Context* context) {
   for (auto i = 0; i < N; ++i) {
     float gi = g[i];
     float hi = nh[i] = h[i] + gi * gi;
-    nw[i] = w[i] + lr[0] * gi / (sqrt(hi) + epsilon);
+    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
   }
 }
 
@@ -61,7 +61,7 @@
         Output(OUTPUT_PARAM)->template mutable_data<T>(),
         Output(OUTPUT_MOMENT_1)->template mutable_data<T>(),
         epsilon_,
-        Input(LR).template data<T>(),
+        Input(LR).template data<T>()[0],
         &context_);
     return true;
   }
@@ -92,7 +92,6 @@
     Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
 
     auto n = Input(GRAD).dim(0);
-    auto block_size = Input(GRAD).size() / n;
 
     const auto* indices = Input(INDICES).template data<SIndex>();
     const auto* gradIn = Input(GRAD).template data<T>();
@@ -100,12 +99,18 @@
     const auto* momentIn = Input(MOMENT_1).template data<T>();
     auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<T>();
     auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>();
+
+    if (n == 0) {
+      return true;
+    }
+
+    auto block_size = Input(GRAD).size_from_dim(1);
     for (auto i = 0; i < n; ++i) {
       auto idx = indices[i];
       if (block_size == 1) {
         float gi = gradIn[i];
         float hi = momentOut[idx] = momentIn[idx] + gi * gi;
-        paramOut[idx] = paramIn[idx] + lr[0] * gi / (sqrt(hi) + epsilon_);
+        paramOut[idx] = paramIn[idx] + lr[0] * gi / (std::sqrt(hi) + epsilon_);
       } else {
         auto offsetI = i * block_size;
         auto offsetIdx = idx * block_size;
@@ -117,7 +122,7 @@
             paramOut + offsetIdx,
             momentOut + offsetIdx,
             epsilon_,
-            lr,
+            lr[0],
             &context_);
       }
     }
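
The refactored helper computes the standard Adagrad step (note that lr is now passed as a plain scalar); a plain-numpy sketch of the same arithmetic:

    import numpy as np

    def adagrad_update(w, h, g, lr, epsilon):
        # accumulate squared gradients, then scale the step per element
        nh = h + g * g
        nw = w + lr * g / (np.sqrt(nh) + epsilon)
        return nw, nh
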
diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
index cd5fe5b..dc47127 100644
--- a/caffe2/sgd/adagrad_op_gpu.cu
+++ b/caffe2/sgd/adagrad_op_gpu.cu
@@ -15,7 +15,7 @@
   CUDA_1D_KERNEL_LOOP(i, N) {
     float gi = g[i];
     float hi = nh[i] = h[i] + gi * gi;
-    ng[i] = lr[0] * gi / (sqrt(hi) + epsilon);
+    ng[i] = lr[0] * gi / (std::sqrt(hi) + epsilon);
   }
 }
 
diff --git a/caffe2/sgd/ftrl_op.cc b/caffe2/sgd/ftrl_op.cc
index f11c934..81e34b5 100644
--- a/caffe2/sgd/ftrl_op.cc
+++ b/caffe2/sgd/ftrl_op.cc
@@ -18,13 +18,13 @@
     T& nz,
     const FtrlParams<T>& params) {
   auto new_n = n + g * g;
-  auto sigma = (sqrt(new_n) - sqrt(n)) / params.alpha;
+  auto sigma = (sqrt(new_n) - sqrt(n)) * params.alphaInv;
   nn = new_n;
   nz = z + g - sigma * w;
   // update the weight
   if (std::abs(nz) > params.lambda1) {
     nw = (params.lambda1 * sgn(nz) - nz) /
-        ((params.beta + sqrt(new_n)) / params.alpha + params.lambda2);
+        ((params.beta + sqrt(new_n)) * params.alphaInv + params.lambda2);
   } else {
     nw = 0.0;
   }
diff --git a/caffe2/sgd/ftrl_op.h b/caffe2/sgd/ftrl_op.h
index 1d93530..27a0b09 100644
--- a/caffe2/sgd/ftrl_op.h
+++ b/caffe2/sgd/ftrl_op.h
@@ -7,11 +7,11 @@
 template <typename T>
 struct FtrlParams {
   explicit FtrlParams(OperatorBase* op)
-      : alpha(op->GetSingleArgument<float>("alpha", 0.005)),
+      : alphaInv(1.0 / op->GetSingleArgument<float>("alpha", 0.005)),
         beta(op->GetSingleArgument<float>("beta", 1.0)),
         lambda1(op->GetSingleArgument<float>("lambda1", 0.001)),
         lambda2(op->GetSingleArgument<float>("lambda2", 0.001)) {}
-  T alpha;
+  T alphaInv;
   T beta;
   T lambda1;
   T lambda2;
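
The FtrlParams change precomputes 1/alpha so the inner update multiplies by alphaInv instead of dividing by alpha; the result is identical up to floating-point rounding:

    import math

    alpha = 0.005
    alpha_inv = 1.0 / alpha
    n, new_n = 2.0, 2.5
    sigma_div = (math.sqrt(new_n) - math.sqrt(n)) / alpha
    sigma_mul = (math.sqrt(new_n) - math.sqrt(n)) * alpha_inv
    assert abs(sigma_div - sigma_mul) < 1e-9
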
diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h
index 063bf24..13f025e 100644
--- a/caffe2/sgd/iter_op.h
+++ b/caffe2/sgd/iter_op.h
@@ -34,11 +34,12 @@
 
   bool RunOnDevice() override {
     if (InputSize() == 0) {
-      LOG(ERROR) << "You are using an old definition of IterOp that will "
-                    "be deprecated soon. More specifically, IterOp now "
-                    "requires an explicit in-place input and output.";
       if (!OperatorBase::OutputIsType<TensorCPU>(0)) {
         // This is the first run; set the iter to start with 0.
+        LOG(ERROR) << "You are using an old definition of IterOp that will "
+                      "be deprecated soon. More specifically, IterOp now "
+                      "requires an explicit in-place input and output.";
+
         auto* output = OperatorBase::Output<TensorCPU>(0);
         VLOG(1) << "Initializing iter counter.";
         output->Resize(1);
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index d28f498..caec15b 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -19,7 +19,7 @@
                 "base_lr", FLT_MAX)) {
     CHECK_NE(base_lr_, FLT_MAX) << "Base learning rate must be set.";
     const string policy = OperatorBase::GetSingleArgument<string>("policy", "");
-    CHECK(policy.size()) << "Must specify a learning rate policy.";
+    CAFFE_ENFORCE(policy.size(), "Must specify a learning rate policy.");
     if (policy == "fixed") {
       functor_.reset(new FixedLearningRate<T>());
     } else if (policy == "step") {
diff --git a/caffe2/utils/mkl_utils.h b/caffe2/utils/mkl_utils.h
new file mode 100644
index 0000000..446001f
--- /dev/null
+++ b/caffe2/utils/mkl_utils.h
@@ -0,0 +1,59 @@
+#ifndef CAFFE2_UTILS_MKL_UTILS_H_
+#define CAFFE2_UTILS_MKL_UTILS_H_
+#ifdef CAFFE2_USE_MKL
+
+#include <mkl.h>
+
+#include "caffe2/core/logging.h"
+
+#if INTEL_MKL_VERSION >= 20170000
+#define CAFFE2_HAS_MKL_SGEMM_PACK
+#define CAFFE2_HAS_MKL_DNN
+
+namespace caffe2 {
+namespace mkl {
+
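+// Thin RAII wrapper over MKL's packed-GEMM helpers: sgemm_alloc reserves the
+// packed buffer, sgemm_pack copies `src` into packed layout, and the
+// destructor releases the buffer via sgemm_free.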
+struct MKLPackedMatrix {
+  char identifier_;
+  char trans_;
+  int m_;
+  int n_;
+  int k_;
+  float alpha_;
+  int ld_;
+  float* data_ = nullptr;
+
+  MKLPackedMatrix(
+      const char identifier,
+      const char trans,
+      const int m,
+      const int n,
+      const int k,
+      const float alpha,
+      const float* src,
+      const int ld)
+      : identifier_(identifier),
+        trans_(trans),
+        m_(m),
+        n_(n),
+        k_(k),
+        alpha_(alpha),
+        ld_(ld) {
+    data_ = sgemm_alloc(&identifier, &m, &n, &k);
+    CAFFE_ENFORCE(data_, "MKL runtime error: cannot allocate sgemm memory.");
+    sgemm_pack(&identifier, &trans, &m, &n, &k, &alpha, src, &ld, data_);
+  }
+
+  ~MKLPackedMatrix() {
+    if (data_) {
+      sgemm_free(data_);
+    }
+  }
+};
+
+} // namespace mkl
+} // namespace caffe2
+
+#endif // INTEL_MKL_VERSION >= 20170000
+#endif // CAFFE2_USE_MKL
+#endif // CAFFE2_UTILS_MKL_UTILS_H_
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index a65cd7e..8c19dd3 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -111,7 +111,7 @@
 void WriteProtoToTextFile(const Message& proto, const char* filename) {
   int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
   FileOutputStream* output = new FileOutputStream(fd);
-  CHECK(google::protobuf::TextFormat::Print(proto, output));
+  CAFFE_ENFORCE(google::protobuf::TextFormat::Print(proto, output));
   delete output;
   close(fd);
 }
@@ -138,7 +138,7 @@
   std::unique_ptr<ZeroCopyOutputStream> raw_output(new FileOutputStream(fd));
   std::unique_ptr<CodedOutputStream> coded_output(
       new CodedOutputStream(raw_output.get()));
-  CHECK(proto.SerializeToCodedStream(coded_output.get()));
+  CAFFE_ENFORCE(proto.SerializeToCodedStream(coded_output.get()));
   coded_output.reset();
   raw_output.reset();
   close(fd);
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
index 8a355d0..2ce6223 100644
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@@ -73,7 +73,7 @@
 using ::google::protobuf::Message;
 
 inline string ProtoDebugString(const Message& proto) {
-  return proto.DebugString();
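+  // ShortDebugString renders the proto on a single line, keeping log output compact.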
+  return proto.ShortDebugString();
 }
 
 bool ReadProtoFromTextFile(const char* filename, Message* proto);
@@ -179,8 +179,9 @@
     CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
     MessageType message;
     if (arg_map_.at(name)->has_s()) {
-      CHECK(message.ParseFromString(arg_map_.at(name)->s()))
-          << "Faild to parse content from the string";
+      CAFFE_ENFORCE(
+          message.ParseFromString(arg_map_.at(name)->s()),
+          "Faild to parse content from the string");
     } else {
       VLOG(1) << "Return empty message for parameter " << name;
     }
@@ -192,8 +193,9 @@
     CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
     vector<MessageType> messages(arg_map_.at(name)->strings_size());
     for (int i = 0; i < messages.size(); ++i) {
-      CHECK(messages[i].ParseFromString(arg_map_.at(name)->strings(i)))
-          << "Faild to parse content from the string";
+      CAFFE_ENFORCE(
+          messages[i].ParseFromString(arg_map_.at(name)->strings(i)),
+          "Faild to parse content from the string");
     }
     return messages;
   }
diff --git a/caffe2/utils/string_utils.cc b/caffe2/utils/string_utils.cc
index 17afd2f..7d4d65a 100644
--- a/caffe2/utils/string_utils.cc
+++ b/caffe2/utils/string_utils.cc
@@ -17,112 +17,4 @@
   return pieces;
 }
 
-Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
-    : escape_(escape) {
-  reset();
-  std::memset(delimTable_, 0, sizeof(delimTable_));
-  for (int i = 0; i < delims.size(); ++i) {
-    delimTable_[(unsigned char)delims.at(i)] = i + 1;
-  }
-}
-
-void Tokenizer::reset() {
-  toBeSkipped_ = 0;
-  startDelimId_ = 0;
-  leftover_.clear();
-}
-
-void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
-  tokenized.modifiedStrings_.clear();
-  tokenized.tokens_.clear();
-
-  char* currentStart = start;
-  std::string* copied = nullptr;
-  if (!leftover_.empty()) {
-    tokenized.modifiedStrings_.emplace_back(new std::string());
-    copied = tokenized.modifiedStrings_.back().get();
-    *copied = std::move(leftover_);
-  }
-
-  char* ch;
-  for (ch = start + toBeSkipped_; ch < end; ++ch) {
-    if (*ch == escape_) {
-      if (!copied) {
-        tokenized.modifiedStrings_.emplace_back(new std::string());
-        copied = tokenized.modifiedStrings_.back().get();
-      }
-      copied->append(currentStart, ch);
-      currentStart = ch + 1;
-      // skip next character, since it's escaped
-      ++ch;
-      continue;
-    }
-    int newDelimId = delimTable_[(unsigned char)*ch];
-    if (newDelimId > 0) {
-      // found delimiter
-      tokenized.tokens_.emplace_back();
-      auto& token = tokenized.tokens_.back();
-      token.startDelimId = startDelimId_;
-      if (copied) {
-        copied->append(currentStart, ch);
-        const char* c_str = copied->data();
-        token.start = c_str;
-        token.end = c_str + copied->size();
-      } else {
-        token.start = currentStart;
-        token.end = ch;
-      }
-      currentStart = ch + 1;
-      copied = nullptr;
-      startDelimId_ = newDelimId - 1;
-    }
-  }
-  tokenized.lastDelim_ = startDelimId_;
-
-  toBeSkipped_ = ch - end;
-  if (copied) {
-    copied->append(currentStart, end);
-    leftover_ = std::move(*copied);
-  } else {
-    leftover_.assign(currentStart, end);
-  }
-}
-
-FileReader::FileReader(const std::string& path, size_t bufferSize)
-    : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
-  fd_ = open(path.c_str(), O_RDONLY, 0777);
-  if (fd_ < 0) {
-    throw std::runtime_error(
-        "Error opening file for reading: " + std::string(std::strerror(errno)));
-  }
-}
-
-void FileReader::reset() {
-  if (lseek(fd_, 0, SEEK_SET) == -1) {
-    throw std::runtime_error(
-        "Error reseting file cursor: " + std::string(std::strerror(errno)));
-  }
-}
-
-FileReader::~FileReader() {
-  if (fd_ >= 0) {
-    close(fd_);
-  }
-}
-
-void FileReader::operator()(CharRange& range) {
-  char* buffer = buffer_.get();
-  auto numRead = read(fd_, buffer, bufferSize_);
-  if (numRead == -1) {
-    throw std::runtime_error(
-        "Error reading file: " + std::string(std::strerror(errno)));
-  }
-  if (numRead == 0) {
-    range.start = nullptr;
-    range.end = nullptr;
-    return;
-  }
-  range.start = buffer;
-  range.end = buffer + numRead;
-}
-}
+} // namespace caffe2
diff --git a/caffe2/utils/string_utils.h b/caffe2/utils/string_utils.h
index 0cd727a..02067bc 100644
--- a/caffe2/utils/string_utils.h
+++ b/caffe2/utils/string_utils.h
@@ -8,111 +8,4 @@
 
 std::vector<std::string> split(char separator, const std::string& string);
 
-struct Token {
-  int startDelimId;
-  const char* start;
-  const char* end;
-};
-
-class TokenizedString {
-  // holder for strings that have been modified
-  std::vector<std::unique_ptr<std::string>> modifiedStrings_;
-  std::vector<Token> tokens_;
-  int lastDelim_;
-
- public:
-  const std::vector<Token>& tokens() const {
-    return tokens_;
-  }
-  const int lastDelim() const {
-    return lastDelim_;
-  }
-  friend class Tokenizer;
-};
-
-class Tokenizer {
- private:
-  int startDelimId_;
-  // state of the tokenizer
-  std::string leftover_;
-  // if we need to skip the first characters of the next batch because
-  // e.g. a escape char that was the last character of the last batch.
-  int toBeSkipped_;
-  int delimTable_[256];
-  const char escape_;
-
- public:
-  Tokenizer(const std::vector<char>& delimiters, char escape);
-  void reset();
-  void next(char* start, char* end, TokenizedString& tokenized);
-};
-
-struct CharRange {
-  char* start;
-  char* end;
-};
-
-struct StringProvider {
-  virtual void operator()(CharRange&) = 0;
-  virtual void reset() = 0;
-  virtual ~StringProvider() {}
-};
-
-class BufferedTokenizer {
- public:
-  BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
-      : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
-
-  bool next(Token& token) {
-    CharRange range;
-    while (tokenIndex_ >= tokenized_.tokens().size()) {
-      range.start = nullptr;
-      while (range.start == nullptr && pass_ < numPasses_) {
-        (*provider_)(range);
-        if (range.start == nullptr) {
-          ++pass_;
-          if (pass_ < numPasses_) {
-            provider_->reset();
-            tokenizer_.reset();
-          }
-        }
-      }
-      if (range.start == nullptr) {
-        return false;
-      }
-      tokenizer_.next(range.start, range.end, tokenized_);
-      tokenIndex_ = 0;
-    }
-    token = tokenized_.tokens()[tokenIndex_++];
-    return true;
-  };
-
-  int endDelim() const {
-    if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
-      return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
-    }
-    return tokenized_.lastDelim();
-  }
-
- private:
-  StringProvider* provider_;
-  Tokenizer tokenizer_;
-  TokenizedString tokenized_;
-  int tokenIndex_;
-  int numPasses_;
-  int pass_{0};
-};
-
-class FileReader : public StringProvider {
- public:
-  explicit FileReader(const std::string& path, size_t bufferSize = 65536);
-  ~FileReader();
-  void operator()(CharRange& range) override;
-  void reset() override;
-
- private:
-  const size_t bufferSize_;
-  int fd_;
-  std::unique_ptr<char[]> buffer_;
-};
-}
+} // namespace caffe2
diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h
index d8d11f9..fbcdd25 100644
--- a/caffe2/utils/zmq_helper.h
+++ b/caffe2/utils/zmq_helper.h
@@ -10,7 +10,7 @@
 class ZmqContext {
  public:
   explicit ZmqContext(int io_threads) : ptr_(zmq_ctx_new()) {
-    CHECK(ptr_ != nullptr) << "Failed to create zmq context.";
+    CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq context.");
     int rc = zmq_ctx_set(ptr_, ZMQ_IO_THREADS, io_threads);
     CHECK_EQ(rc, 0);
     rc = zmq_ctx_set(ptr_, ZMQ_MAX_SOCKETS, ZMQ_MAX_SOCKETS_DFLT);
@@ -55,7 +55,7 @@
  public:
   explicit ZmqSocket(int type)
       : context_(1), ptr_(zmq_socket(context_.ptr(), type)) {
-    CHECK(ptr_ != nullptr) << "Faild to create zmq socket.";
+    CAFFE_ENFORCE(ptr_ != nullptr, "Faild to create zmq socket.");
   }
 
   ~ZmqSocket() {
@@ -97,7 +97,7 @@
   }
 
   int SendTillSuccess(const string& msg, int flags) {
-    CHECK(msg.size()) << "You cannot send an empty message.";
+    CAFFE_ENFORCE(msg.size(), "You cannot send an empty message.");
     int nbytes = 0;
     do {
       nbytes = Send(msg, flags);