fbsync
diff --git a/build.py b/build.py
index f148b1d..1da6135 100644
--- a/build.py
+++ b/build.py
@@ -24,7 +24,7 @@
# Otherwise, use the following line: we will build protobuf using the
# included source file.
#USE_SYSTEM_PROTOBUF = False
- #PROTOC_BINARY = 'gen/third_party/protoc'
+ #PROTOC_BINARY = 'gen/third_party/google/protoc'
# Note for the line above: if you are doing things like cross-compilation,
# the built protoc compiler will not work on the host, in which case you
# will need to provide a protoc binary that can run on the host environment.
diff --git a/build_android.py b/build_android.py
index b3aa329..5a71846 100644
--- a/build_android.py
+++ b/build_android.py
@@ -14,7 +14,7 @@
from build import Config
STANDALONE_TCHAIN_ROOT = (
- '/opt/android_ndk/android-ndk-r10e/'
+ '/Users/jiayq/Research/android-ndk-r12b/'
'standalone-toolchains/arm-linux-androideabi-4.9-android-21/')
# We change necessary components in the Config class.
@@ -24,7 +24,7 @@
Config.AR = STANDALONE_TCHAIN_ROOT + 'bin/arm-linux-androideabi-ar'
Config.GENDIR = "gen-android"
Config.USE_SYSTEM_PROTOBUF = False
-Config.PROTOC_BINARY = 'gen/third_party/protoc'
+Config.PROTOC_BINARY = 'gen/third_party/google/protoc'
Config.USE_LITE_PROTO = False
Config.USE_SYSTEM_EIGEN = False
Config.USE_GLOG = False
diff --git a/build_android_prepare.py b/build_android_prepare.py
index 01975eb..ed306cf 100644
--- a/build_android_prepare.py
+++ b/build_android_prepare.py
@@ -13,7 +13,7 @@
Brewery.Run(
Config,
['build_android_prepare.py',
- 'build', '//third_party:protoc'])
+ 'build', '//third_party/google:protoc'])
else:
print('This script is not intended to be used as an imported module.')
sys.exit(1)
diff --git a/caffe2/binaries/convert_caffe_image_db.cc b/caffe2/binaries/convert_caffe_image_db.cc
index e1ac8db..e0e207d 100644
--- a/caffe2/binaries/convert_caffe_image_db.cc
+++ b/caffe2/binaries/convert_caffe_image_db.cc
@@ -28,7 +28,7 @@
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
caffe::Datum datum;
- CHECK(datum.ParseFromString(cursor->value()));
+ CAFFE_ENFORCE(datum.ParseFromString(cursor->value()));
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
diff --git a/caffe2/binaries/convert_encoded_to_raw_leveldb.cc b/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
index 9a70638..aefc81f 100644
--- a/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
+++ b/caffe2/binaries/convert_encoded_to_raw_leveldb.cc
@@ -45,7 +45,7 @@
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, input_db_name, &db_temp);
- CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
+ CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, ".");
input_db.reset(db_temp);
}
@@ -61,8 +61,11 @@
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, output_db_name, &db_temp);
- CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
- << ". Is it already existing?";
+ CAFFE_ENFORCE(
+ status.ok(),
+ "Failed to open leveldb ",
+ output_db_name,
+ ". Is it already existing?");
output_db.reset(db_temp);
}
batch.reset(new leveldb::WriteBatch());
@@ -84,7 +87,7 @@
iter->SeekToFirst();
int count = 0;
for (; iter->Valid(); iter->Next()) {
- CHECK(input_protos.ParseFromString(iter->value().ToString()));
+ CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString()));
label->CopyFrom(input_protos.protos(1));
const string& encoded_image = input_protos.protos(0).string_data(0);
int encoded_size = encoded_image.size();
diff --git a/caffe2/binaries/fb_run_plan_mpi.cc b/caffe2/binaries/fb_run_plan_mpi.cc
index 204501e..822071a 100644
--- a/caffe2/binaries/fb_run_plan_mpi.cc
+++ b/caffe2/binaries/fb_run_plan_mpi.cc
@@ -62,7 +62,7 @@
}
caffe2::PlanDef plan_def;
- CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+ CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
diff --git a/caffe2/binaries/inspect_gpus.cc b/caffe2/binaries/inspect_gpus.cc
index 4c58071..1a9cdc2 100644
--- a/caffe2/binaries/inspect_gpus.cc
+++ b/caffe2/binaries/inspect_gpus.cc
@@ -26,7 +26,7 @@
}
vector<vector<bool> > access_pattern;
- CHECK(caffe2::GetCudaPeerAccessPattern(&access_pattern));
+ CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));
std::stringstream sstream;
// Find topology
diff --git a/caffe2/binaries/make_cifar_db.cc b/caffe2/binaries/make_cifar_db.cc
index 15eb9ef..1c5ffd1 100644
--- a/caffe2/binaries/make_cifar_db.cc
+++ b/caffe2/binaries/make_cifar_db.cc
@@ -76,7 +76,7 @@
LOG(INFO) << "Converting file " << filename;
std::ifstream data_file(filename.c_str(),
std::ios::in | std::ios::binary);
- CHECK(data_file) << "Unable to open file " << filename;
+ CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
char str_buffer[kCIFARImageNBytes];
int label_value;
string serialized_protos;
diff --git a/caffe2/binaries/make_mnist_db.cc b/caffe2/binaries/make_mnist_db.cc
index 4d26c2b..7086ff6 100644
--- a/caffe2/binaries/make_mnist_db.cc
+++ b/caffe2/binaries/make_mnist_db.cc
@@ -32,8 +32,8 @@
// Open files
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
- CHECK(image_file) << "Unable to open file " << image_filename;
- CHECK(label_file) << "Unable to open file " << label_filename;
+ CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename);
+ CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename);
// Read the magic and the meta data
uint32_t magic;
uint32_t num_items;
diff --git a/caffe2/binaries/predictor_verifier.cc b/caffe2/binaries/predictor_verifier.cc
index feb39a3..56d9cd0 100644
--- a/caffe2/binaries/predictor_verifier.cc
+++ b/caffe2/binaries/predictor_verifier.cc
@@ -19,8 +19,8 @@
LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net.";
}
caffe2::NetDef init_net, predict_net;
- CHECK(ReadProtoFromFile(FLAGS_init_net, &init_net));
- CHECK(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
+ CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
+ CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
// Can be large due to constant fills
VLOG(1) << "Init net: " << ProtoDebugString(init_net);
LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
diff --git a/caffe2/binaries/run_plan.cc b/caffe2/binaries/run_plan.cc
index a6b3e5c..7bc4c64 100644
--- a/caffe2/binaries/run_plan.cc
+++ b/caffe2/binaries/run_plan.cc
@@ -14,7 +14,7 @@
}
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
caffe2::PlanDef plan_def;
- CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+ CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
diff --git a/caffe2/binaries/run_plan_mpi.cc b/caffe2/binaries/run_plan_mpi.cc
index a439d3d..dda9a8f 100644
--- a/caffe2/binaries/run_plan_mpi.cc
+++ b/caffe2/binaries/run_plan_mpi.cc
@@ -21,7 +21,7 @@
caffe2::GlobalInit(&argc, &argv);
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
caffe2::PlanDef plan_def;
- CHECK(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
+ CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
diff --git a/caffe2/binaries/speed_benchmark.cc b/caffe2/binaries/speed_benchmark.cc
index c794aa8..5f58169 100644
--- a/caffe2/binaries/speed_benchmark.cc
+++ b/caffe2/binaries/speed_benchmark.cc
@@ -16,12 +16,12 @@
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
// Run initialization network.
caffe2::NetDef net_def;
- CHECK(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
- CHECK(workspace->RunNetOnce(net_def));
- CHECK(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
+ CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
+ CAFFE_ENFORCE(workspace->RunNetOnce(net_def));
+ CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
- CHECK(net->Run());
+ CAFFE_ENFORCE(net->Run());
net->TEST_Benchmark(caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);
return 0;
}
diff --git a/caffe2/binaries/zmq_feeder.cc b/caffe2/binaries/zmq_feeder.cc
index d2fcfb6..fc44657 100644
--- a/caffe2/binaries/zmq_feeder.cc
+++ b/caffe2/binaries/zmq_feeder.cc
@@ -23,8 +23,10 @@
LOG(INFO) << "Opening DB...";
auto in_db = caffe2::db::CreateDB(
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ);
- CHECK(in_db) << "Cannot load input db " << caffe2::FLAGS_input_db
- << " of expected type " << caffe2::FLAGS_input_db_type;
+ CAFFE_ENFORCE(
+ in_db,
+ "Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " +
+ caffe2::FLAGS_input_db_type);
auto cursor = in_db->NewCursor();
LOG(INFO) << "DB opened.";
diff --git a/caffe2/contrib/nervana/nervana_init_gpu.cc b/caffe2/contrib/nervana/nervana_init_gpu.cc
index 81dfa20..994fc97 100644
--- a/caffe2/contrib/nervana/nervana_init_gpu.cc
+++ b/caffe2/contrib/nervana/nervana_init_gpu.cc
@@ -29,10 +29,10 @@
VLOG(1) << "Loaded nervana kernels from path "
<< FLAGS_nervana_cubin_path;
} else {
- // Since this is not a critical error we will just log it in info.
- LOG(INFO) << "Cannot load nervana gpu kernels from path "
- << FLAGS_nervana_cubin_path
- << ", will disable Caffe2 nervana engines.";
+ // Since this is not a critical error we will just vlog it.
+ VLOG(1) << "Cannot load nervana gpu kernels from path "
+ << FLAGS_nervana_cubin_path
+ << ", will disable Caffe2 nervana engines.";
}
// We will always return true for this initialization, because the loading
// result is kept and accessible via NervanaKernelLoaded(). This allows us
diff --git a/caffe2/contrib/nervana/nervana_math_gpu.cc b/caffe2/contrib/nervana/nervana_math_gpu.cc
index d23cbcb..f3010b9 100644
--- a/caffe2/contrib/nervana/nervana_math_gpu.cc
+++ b/caffe2/contrib/nervana/nervana_math_gpu.cc
@@ -21,9 +21,24 @@
int ldb = (TransB == CblasNoTrans) ? N : K;
bool a_t = (TransA == CblasTrans);
bool b_t = (TransB == CblasTrans);
- CHECK(nervana_sgemm(
- const_cast<float*>(A), const_cast<float*>(B), C, a_t, b_t, M, N, K,
- lda, ldb, N, alpha, beta, nullptr, false, false, context->cuda_stream()));
+ CAFFE_ENFORCE(nervana_sgemm(
+ const_cast<float*>(A),
+ const_cast<float*>(B),
+ C,
+ a_t,
+ b_t,
+ M,
+ N,
+ K,
+ lda,
+ ldb,
+ N,
+ alpha,
+ beta,
+ nullptr,
+ false,
+ false,
+ context->cuda_stream()));
}
} // namespace math
diff --git a/caffe2/contrib/nnpack/nnpack_ops.cc b/caffe2/contrib/nnpack/nnpack_ops.cc
index 374015d..d3931eb 100644
--- a/caffe2/contrib/nnpack/nnpack_ops.cc
+++ b/caffe2/contrib/nnpack/nnpack_ops.cc
@@ -72,10 +72,10 @@
OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
kts_(get_nnp_convolution_transform_strategy(
OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
- "NNPack only supports NCHW order. Please consider add \
- TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
+ "NNPack only supports NCHW order. Please consider adding "
+ "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
}
bool RunOnDeviceWithOrderNCHW() override;
@@ -176,28 +176,28 @@
public:
NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
- "NNPack only supports NCHW order. Please consider add \
- TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
- CAFFE_ENFORCE(
+ "NNPack only supports NCHW order. Please consider add "
+ "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
+ OPERATOR_NEEDS_FEATURE(
this->kernel_h_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->kernel_w_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->stride_h_ == 2, "NNPack only supports MaxPool stride size 2*2!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->stride_w_ == 2, "NNPack only supports MaxPool stride size 2*2!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->pad_t_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->pad_l_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->pad_r_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
- CAFFE_ENFORCE(
+ OPERATOR_NEEDS_FEATURE(
this->pad_b_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
}
diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py
index 5205f3b..5316c4b 100644
--- a/caffe2/contrib/nnpack/nnpack_ops_test.py
+++ b/caffe2/contrib/nnpack/nnpack_ops_test.py
@@ -9,9 +9,12 @@
import numpy as np
import time
import os
-from caffe2.python import core
+from caffe2.python import core, dyndep
import caffe2.python.hypothesis_test_util as hu
+
+dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")
+
np.random.seed(1)
diff --git a/caffe2/contrib/torch/torch_op.h b/caffe2/contrib/torch/torch_op.h
index 037b4db..4669392 100644
--- a/caffe2/contrib/torch/torch_op.h
+++ b/caffe2/contrib/torch/torch_op.h
@@ -49,9 +49,10 @@
}
static const char* tensorTy(const Blob& blob) {
- CHECK(blob.template IsType<Tensor<Context>>());
+ CAFFE_ENFORCE(blob.template IsType<Tensor<Context>>());
const auto& tc = blob.template Get<Tensor<Context>>();
- CHECK(tc.template IsType<float>()) << tc.meta().name() << ", " << tc.size();
+ CAFFE_ENFORCE(
+ tc.template IsType<float>(), tc.meta().name(), ", ", tc.size());
return Traits::tensorTy;
}
@@ -141,7 +142,7 @@
auto* thDst = static_cast<typename Traits::Tensor*>(torchDst);
auto* tcDst = dst->template GetMutable<Tensor<Context>>();
CHECK_NOTNULL(src->storage->data);
- CHECK(src->storage->size);
+ CAFFE_ENFORCE(src->storage->size);
CHECK_EQ(src->storage->data, thDst->storage->data);
CHECK_EQ(src->storage->data, tcDst->template data<float>());
CHECK_EQ(src->storage->size, thDst->storage->size);
@@ -162,10 +163,10 @@
return;
}
- CHECK(lua_istable(L(), -1));
+ CAFFE_ENFORCE(lua_istable(L(), -1));
lua_pushnil(L());
for (auto i = 0; i < blobs.size(); ++i) {
- CHECK(lua_next(L(), -2));
+ CAFFE_ENFORCE(lua_next(L(), -2));
verifyOutput(blobs[i], tensors[i]);
lua_pop(L(), 1);
}
@@ -264,7 +265,8 @@
lua_pushnil(L);
int i = 0;
while (lua_next(L, -3) && i < paramBlobs.size()) {
- CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+ CAFFE_ENFORCE(
+ luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
auto* param =
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -275,7 +277,7 @@
tc->Resize(paramShape);
tc->template mutable_data<float>();
} else {
- CHECK(tc->dims() == paramShape);
+ CAFFE_ENFORCE(tc->dims() == paramShape);
}
lua_pop(L, 1);
i++;
@@ -286,7 +288,8 @@
lua_getfield(L, -1, "output");
if (outputBlobs.size() == 0) {
} else if (outputBlobs.size() == 1) {
- CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+ CAFFE_ENFORCE(
+ luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
auto* output =
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -299,7 +302,8 @@
lua_pushnil(L);
auto i = 0;
while (lua_next(L, -2) && i < outputBlobs.size()) {
- CHECK(luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
+ CAFFE_ENFORCE(
+ luaT_isudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
auto* output =
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(
luaT_toudata(L, -1, torch::Torch<Context>::Traits::tensorTy));
@@ -310,7 +314,7 @@
tc->Resize(outputShape);
tc->template mutable_data<float>();
} else {
- CHECK(tc->dims() == outputShape);
+ CAFFE_ENFORCE(tc->dims() == outputShape);
}
++i;
}
@@ -385,8 +389,9 @@
lua_pushnil(L);
auto i = 0;
while (lua_next(L, -2) && i < numParams) {
- CHECK(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])))
- << luaT_typename(L, -1);
+ CAFFE_ENFORCE(
+ luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])),
+ luaT_typename(L, -1));
auto* udata = luaT_toudata(L, -1, state_.tensorTy(*paramBlobs[i]));
state_.setTensor(
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
@@ -517,7 +522,7 @@
lua_pushnil(L);
auto i = 0;
while (lua_next(L, -3) && i < numParams) {
- CHECK(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])));
+ CAFFE_ENFORCE(luaT_isudata(L, -1, state_.tensorTy(*paramBlobs[i])));
auto* udata = luaT_toudata(L, -1, state_.tensorTy(*paramBlobs[i]));
state_.setTensor(
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
@@ -530,7 +535,7 @@
lua_pushnil(L);
i = 0;
while (lua_next(L, -2) && i < numParams) {
- CHECK(luaT_isudata(L, -1, state_.tensorTy(*gradParamBlobs[i])));
+ CAFFE_ENFORCE(luaT_isudata(L, -1, state_.tensorTy(*gradParamBlobs[i])));
auto* udata = luaT_toudata(L, -1, state_.tensorTy(*gradParamBlobs[i]));
state_.setTensor(
static_cast<typename torch::Torch<Context>::Traits::Tensor*>(udata),
diff --git a/caffe2/contrib/torch/torch_op_gpu.cpp b/caffe2/contrib/torch/torch_op_gpu.cpp
index 9d64365..14f2ac8 100644
--- a/caffe2/contrib/torch/torch_op_gpu.cpp
+++ b/caffe2/contrib/torch/torch_op_gpu.cpp
@@ -29,9 +29,9 @@
THCState* cudaState(Torch<CUDAContext>* t) {
auto* L = t->L();
lua_getglobal(L, "cutorch");
- CHECK(!lua_isnil(L, -1));
+ CAFFE_ENFORCE(!lua_isnil(L, -1));
lua_getfield(L, -1, "_state");
- CHECK(!lua_isnil(L, -1));
+ CAFFE_ENFORCE(!lua_isnil(L, -1));
THCState* state = reinterpret_cast<THCState*>(lua_touserdata(L, -1));
lua_pop(L, 2);
return state;
diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
index a7223ae..2324286 100644
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@@ -152,7 +152,8 @@
*/
void Serialize(
const string& name,
- BlobSerializerBase::SerializationAcceptor acceptor) const;
+ BlobSerializerBase::SerializationAcceptor acceptor,
+ int chunk_size = -1) const;
/**
* @brief Convenience function to serialize a blob to a string.
diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc
index 0eba012..5d8d967 100644
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@@ -134,7 +134,7 @@
blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor); \
string serialized = blob.Serialize("test"); \
BlobProto proto; \
- CHECK(proto.ParseFromString(serialized)); \
+ CAFFE_ENFORCE(proto.ParseFromString(serialized)); \
EXPECT_EQ(proto.name(), "test"); \
EXPECT_EQ(proto.type(), "Tensor"); \
EXPECT_TRUE(proto.has_tensor()); \
@@ -183,7 +183,7 @@
blob.Reset(new TensorCUDA(tensor, &context));
string serialized = blob.Serialize("test");
BlobProto proto;
- CHECK(proto.ParseFromString(serialized));
+ CAFFE_ENFORCE(proto.ParseFromString(serialized));
EXPECT_EQ(proto.name(), "test");
EXPECT_TRUE(proto.has_tensor());
const TensorProto& tensor_proto = proto.tensor();
diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 72cd57e..c99effc 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -30,7 +30,7 @@
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
- CHECK(blob.IsType<std::string>());
+ CAFFE_ENFORCE(blob.IsType<std::string>());
BlobProto blob_proto;
blob_proto.set_name(name);
@@ -72,10 +72,11 @@
// The blob serialization member function implementation.
void Blob::Serialize(
const string& name,
- BlobSerializerBase::SerializationAcceptor acceptor) const {
+ BlobSerializerBase::SerializationAcceptor acceptor,
+ int chunk_size) const {
std::unique_ptr<BlobSerializerBase> serializer(CreateSerializer(meta_.id()));
CAFFE_ENFORCE(serializer, "No known serializer for ", meta_.name());
- serializer->Serialize(*this, name, acceptor);
+ serializer->SerializeWithChunkSize(*this, name, acceptor, chunk_size);
}
// The blob serialization member function implementation.
diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h
index 189e336..6a448c9 100644
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@@ -49,6 +49,12 @@
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override;
+ void SerializeWithChunkSize(
+ const Blob& blob,
+ const string& name,
+ SerializationAcceptor acceptor,
+ int chunk_size) override;
+
void Serialize(const Tensor<Context>& tensor, const string& name,
TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
@@ -175,15 +181,26 @@
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor) {
- CHECK(blob.IsType<Tensor<Context>>());
+ this->SerializeWithChunkSize(
+ blob, name, acceptor, FLAGS_caffe2_tensor_chunk_size);
+}
+
+template <class Context>
+void TensorSerializer<Context>::SerializeWithChunkSize(
+ const Blob& blob,
+ const string& name,
+ BlobSerializerBase::SerializationAcceptor acceptor,
+ int chunk_size) {
+ CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
const auto& tensor = blob.template Get<Tensor<Context>>();
+ chunk_size = chunk_size == -1 ? FLAGS_caffe2_tensor_chunk_size : chunk_size;
#ifndef __ANDROID__
std::vector<std::future<void>> futures;
#endif
for (size_t chunkBegin = 0; chunkBegin < tensor.size();
- chunkBegin += FLAGS_caffe2_tensor_chunk_size) {
+ chunkBegin += chunk_size) {
auto task = [&](size_t chunkBegin) {
BlobProto blob_proto;
blob_proto.set_name(name);
@@ -191,15 +208,11 @@
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
- tensor,
- name,
- blob_proto.mutable_tensor(),
- chunkBegin,
- FLAGS_caffe2_tensor_chunk_size);
+ tensor, name, blob_proto.mutable_tensor(), chunkBegin, chunk_size);
acceptor(name, blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
- if (tensor.size() > FLAGS_caffe2_tensor_chunk_size) {
+ if (tensor.size() > chunk_size) {
futures.emplace_back(std::async(std::launch::async, task, chunkBegin));
} else {
// Sync mode for small tensors
@@ -224,11 +237,17 @@
const Tensor<Context>& input, const string& name,
TensorProto* proto_ptr, size_t chunkBegin, int32_t chunkSize) {
CAFFE_ENFORCE(
- chunkBegin < input.size(),
- "Chunk begin is out of tensor: ",
- chunkBegin,
- ' ',
- input.size());
+ chunkBegin < input.size(),
+ "Chunk begin is out of tensor: ",
+ chunkBegin,
+ ' ',
+ input.size());
+ CAFFE_ENFORCE(
+ input.raw_data(),
+ "The input does not have data input yet. This is probably because you "
+ "created a tensor of non-zero shape but never filled its data via "
+ "mutable_data() calls. This means that it makes no sense to serialize "
+ "the tensor content.");
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h
index 1245dd9..c1d2802 100644
--- a/caffe2/core/blob_serializer_base.h
+++ b/caffe2/core/blob_serializer_base.h
@@ -38,6 +38,15 @@
*/
virtual void Serialize(const Blob& blob, const std::string& name,
SerializationAcceptor acceptor) = 0;
+
+ virtual void SerializeWithChunkSize(
+ const Blob& blob,
+ const std::string& name,
+ SerializationAcceptor acceptor,
+ int chunk_size) {
+ // Base implementation.
+ Serialize(blob, name, acceptor);
+ }
};
} // namespace caffe2
diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc
index 8ec62da..18fae4f 100644
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@@ -25,6 +25,12 @@
class BlobTestFoo {};
class BlobTestBar {};
+}
+
+CAFFE_KNOWN_TYPE(BlobTestFoo);
+CAFFE_KNOWN_TYPE(BlobTestBar);
+
+namespace {
TEST(BlobTest, Blob) {
Blob blob;
@@ -260,10 +266,10 @@
}
}
-TYPED_TEST(TensorCPUDeathTest, CannotShareDataWhenShapeNotSet) {
+TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
TensorCPU tensor;
- EXPECT_DEATH(tensor.ShareExternalPointer(raw_buffer.get()), "");
+ ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
}
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
@@ -581,5 +587,29 @@
}
}
+TEST(CustomChunkSize, BigTensorSerialization) {
+ int64_t d1 = 2;
+ int64_t d2 = FLAGS_caffe2_test_big_tensor_size
+ ? FLAGS_caffe2_test_big_tensor_size / d1
+ : static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
+ int64_t size = d1 * d2;
+
+ Blob blob;
+ TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+ tensor->Resize(d1, d2);
+ tensor->mutable_data<float>();
+ std::mutex mutex;
+ int counter = 0;
+ auto acceptor = [&](const std::string& key, const std::string& value) {
+ std::lock_guard<std::mutex> guard(mutex);
+ counter++;
+ };
+ blob.Serialize("test", acceptor, size);
+ EXPECT_EQ(counter, 1);
+
+ counter = 0;
+ blob.Serialize("test", acceptor, (size / 2) + 1);
+ EXPECT_EQ(counter, 2);
+}
} // namespace
} // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 5e46c8d..90e28f8 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -8,6 +8,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
+#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"
@@ -43,6 +44,8 @@
namespace caffe2 {
+CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);
+
thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
// Static global variables for setting up the memory pool.
diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc
index 8ef5a51..9e59951 100644
--- a/caffe2/core/db.cc
+++ b/caffe2/core/db.cc
@@ -6,6 +6,10 @@
#include "caffe2/core/logging.h"
namespace caffe2 {
+
+CAFFE_KNOWN_TYPE(db::DBReader);
+CAFFE_KNOWN_TYPE(db::Cursor);
+
namespace db {
CAFFE_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
@@ -30,7 +34,7 @@
void SeekToFirst() override {
fseek(file_, 0, SEEK_SET);
- CHECK(!feof(file_)) << "Hmm, empty file?";
+ CAFFE_ENFORCE(!feof(file_), "Hmm, empty file?");
// Read the first item.
valid_ = true;
Next();
@@ -64,12 +68,12 @@
}
string key() override {
- CHECK(valid_) << "Cursor is at invalid location!";
+ CAFFE_ENFORCE(valid_, "Cursor is at invalid location!");
return string(key_.data(), key_len_);
}
string value() override {
- CHECK(valid_) << "Cursor is at invalid location!";
+ CAFFE_ENFORCE(valid_, "Cursor is at invalid location!");
return string(value_.data(), value_len_);
}
@@ -133,7 +137,7 @@
file_ = fopen(source.c_str(), "rb");
break;
}
- CHECK(file_) << "Cannot open file: " << source;
+ CAFFE_ENFORCE(file_, "Cannot open file: " + source);
VLOG(1) << "Opened MiniDB " << source;
}
~MiniDB() { Close(); }
@@ -151,7 +155,7 @@
}
unique_ptr<Transaction> NewTransaction() override {
- CHECK(this->mode_ == NEW || this->mode_ == WRITE);
+ CAFFE_ENFORCE(this->mode_ == NEW || this->mode_ == WRITE);
return make_unique<MiniDBTransaction>(file_, &file_access_mutex_);
}
@@ -169,7 +173,7 @@
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor) {
- CHECK(blob.IsType<DBReader>());
+ CAFFE_ENFORCE(blob.IsType<DBReader>());
auto& reader = blob.Get<DBReader>();
DBReaderProto proto;
proto.set_name(name);
diff --git a/caffe2/core/db.h b/caffe2/core/db.h
index 530f2c3..9d92c89 100644
--- a/caffe2/core/db.h
+++ b/caffe2/core/db.h
@@ -130,8 +130,12 @@
friend class DBReaderSerializer;
DBReader() {}
- DBReader(const string& db_type, const string& source) {
- Open(db_type, source);
+ DBReader(
+ const string& db_type,
+ const string& source,
+ const int32_t num_shards = 1,
+ const int32_t shard_id = 0) {
+ Open(db_type, source, num_shards, shard_id);
}
explicit DBReader(const DBReaderProto& proto) {
@@ -142,6 +146,8 @@
"does not support it.");
cursor_->Seek(proto.key());
}
+ num_shards_ = 1;
+ shard_id_ = 0;
}
explicit DBReader(std::unique_ptr<DB> db)
@@ -152,7 +158,11 @@
cursor_ = db_->NewCursor();
}
- void Open(const string& db_type, const string& source) {
+ void Open(
+ const string& db_type,
+ const string& source,
+ const int32_t num_shards = 1,
+ const int32_t shard_id = 0) {
// Note(jiayq): resetting is needed when we re-open e.g. leveldb where no
// concurrent access is allowed.
cursor_.reset();
@@ -162,9 +172,16 @@
db_ = CreateDB(db_type_, source_, READ);
CAFFE_ENFORCE(db_,
"Cannot open db: ", source_, " of type ", db_type_);
+ CAFFE_ENFORCE(num_shards >= 1);
+ CAFFE_ENFORCE(shard_id >= 0);
+ CAFFE_ENFORCE(shard_id < num_shards);
+ num_shards_ = num_shards;
+ shard_id_ = shard_id;
cursor_ = db_->NewCursor();
+ SeekToFirst();
}
+ public:
/**
* Read a set of key and value from the db and move to next. Thread safe.
*
@@ -182,13 +199,18 @@
* output blob.
*/
void Read(string* key, string* value) const {
- CHECK(cursor_ != nullptr) << "Reader not initialized.";
+ CAFFE_ENFORCE(cursor_ != nullptr, "Reader not initialized.");
std::unique_lock<std::mutex> mutex_lock(reader_mutex_);
*key = cursor_->key();
*value = cursor_->value();
- cursor_->Next();
- if (!cursor_->Valid()) {
- cursor_->SeekToFirst();
+
+ // In sharded mode, each read skips num_shards_ records
+ for (int s = 0; s < num_shards_; s++) {
+ cursor_->Next();
+ if (!cursor_->Valid()) {
+ MoveToBeginning();
+ break;
+ }
}
}
@@ -196,9 +218,9 @@
* @brief Seeks to the first key. Thread safe.
*/
void SeekToFirst() const {
- CHECK(cursor_ != nullptr) << "Reader not initialized.";
+ CAFFE_ENFORCE(cursor_ != nullptr, "Reader not initialized.");
std::unique_lock<std::mutex> mutex_lock(reader_mutex_);
- cursor_->SeekToFirst();
+ MoveToBeginning();
}
/**
@@ -215,11 +237,24 @@
}
private:
+ void MoveToBeginning() const {
+ if (cursor_->SupportsSeek()) {
+ cursor_->SeekToFirst();
+ }
+ for (auto s = 0; s < shard_id_; s++) {
+ cursor_->Next();
+ CAFFE_ENFORCE(
+ cursor_->Valid(), "Db has fewer rows than shard id: ", s, shard_id_);
+ }
+ }
+
string db_type_;
string source_;
unique_ptr<DB> db_;
unique_ptr<Cursor> cursor_;
mutable std::mutex reader_mutex_;
+ uint32_t num_shards_;
+ uint32_t shard_id_;
DISABLE_COPY_AND_ASSIGN(DBReader);
};
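The sharded DBReader above distributes the rows of one database across several readers: MoveToBeginning() skips shard_id_ rows past the first record, and each Read() then advances the cursor num_shards_ rows. A minimal usage sketch, assuming a minidb at an illustrative path (not part of this diff):

#include <string>

#include "caffe2/core/db.h"

void ReadMyShard() {
  // Reader 1 of 4: visits records 1, 5, 9, ... and wraps around at the end.
  caffe2::db::DBReader reader(
      "minidb", "/tmp/example.minidb", /*num_shards=*/4, /*shard_id=*/1);
  std::string key, value;
  reader.Read(&key, &value);  // thread safe
}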
diff --git a/caffe2/core/init.cc b/caffe2/core/init.cc
index a73f0c1..bac6766 100644
--- a/caffe2/core/init.cc
+++ b/caffe2/core/init.cc
@@ -5,12 +5,20 @@
#endif
namespace caffe2 {
+namespace internal {
+Caffe2InitializeRegistry* Caffe2InitializeRegistry::Registry() {
+ static Caffe2InitializeRegistry gRegistry;
+ return &gRegistry;
+}
+}
+
bool GlobalInit(int* pargc, char*** pargv) {
static bool global_init_was_already_run = false;
if (global_init_was_already_run) {
VLOG(1) << "GlobalInit has already been called: did you double-call?";
return true;
}
+ global_init_was_already_run = true;
bool success = true;
success &= internal::Caffe2InitializeRegistry::Registry()
->RunRegisteredEarlyInitFunctions(pargc, pargv);
@@ -23,9 +31,11 @@
// All other initialization functions.
success &= internal::Caffe2InitializeRegistry::Registry()
->RunRegisteredInitFunctions(pargc, pargv);
+ if (!success) {
+ global_init_was_already_run = false;
+ }
CAFFE_ENFORCE(success,
"Failed to run some init functions for caffe2.");
- global_init_was_already_run = true;
// TODO: if we fail GlobalInit(), should we continue?
return success;
}
diff --git a/caffe2/core/init.h b/caffe2/core/init.h
index ca7f979..b20866e 100644
--- a/caffe2/core/init.h
+++ b/caffe2/core/init.h
@@ -11,10 +11,9 @@
class Caffe2InitializeRegistry {
public:
typedef bool (*InitFunction)(int*, char***);
- static Caffe2InitializeRegistry* Registry() {
- static Caffe2InitializeRegistry gRegistry;
- return &gRegistry;
- }
+ // Registry() is defined in .cpp file to make registration work across
+ // multiple shared libraries loaded with RTLD_LOCAL
+ static Caffe2InitializeRegistry* Registry();
void Register(InitFunction function, bool run_early,
const char* description) {
diff --git a/caffe2/core/init_omp.cc b/caffe2/core/init_omp.cc
index 0f27c03..1b3be53 100644
--- a/caffe2/core/init_omp.cc
+++ b/caffe2/core/init_omp.cc
@@ -28,16 +28,15 @@
if (!getenv("OMP_NUM_THREADS")) {
// OMP_NUM_THREADS not passed explicitly, so *disable* OMP by
// default. The user can use the CLI flag to override.
- LOG(INFO) << "OMP_NUM_THREADS not passed, defaulting to 1 thread";
+ VLOG(1) << "OMP_NUM_THREADS not passed, defaulting to 1 thread";
omp_set_num_threads(1);
}
if (FLAGS_caffe2_omp_num_threads > 0) {
- LOG(INFO) << "Setting omp_num_threads to " << FLAGS_caffe2_omp_num_threads;
+ VLOG(1) << "Setting omp_num_threads to " << FLAGS_caffe2_omp_num_threads;
omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
}
- LOG(INFO) << "Caffe2 running with " << omp_get_max_threads()
- << " OMP threads";
+ VLOG(1) << "Caffe2 running with " << omp_get_max_threads() << " OMP threads";
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2SetOpenMPThreads,
@@ -48,24 +47,23 @@
#ifdef CAFFE2_USE_MKL
bool Caffe2SetMKLThreads(int*, char***) {
if (!getenv("MKL_NUM_THREADS")) {
- LOG(INFO) << "MKL_NUM_THREADS not passed, defaulting to 1 thread";
+ VLOG(1) << "MKL_NUM_THREADS not passed, defaulting to 1 thread";
mkl_set_num_threads(1);
}
// If caffe2_omp_num_threads is set, we use that for MKL as well.
if (FLAGS_caffe2_omp_num_threads > 0) {
- LOG(INFO) << "Setting mkl_num_threads to " << FLAGS_caffe2_omp_num_threads
- << " as inherited from omp_num_threads.";
+ VLOG(1) << "Setting mkl_num_threads to " << FLAGS_caffe2_omp_num_threads
+ << " as inherited from omp_num_threads.";
mkl_set_num_threads(FLAGS_caffe2_omp_num_threads);
}
// Override omp_num_threads if mkl_num_threads is set.
if (FLAGS_caffe2_mkl_num_threads > 0) {
- LOG(INFO) << "Setting mkl_num_threads to " << FLAGS_caffe2_mkl_num_threads;
+ VLOG(1) << "Setting mkl_num_threads to " << FLAGS_caffe2_mkl_num_threads;
mkl_set_num_threads(FLAGS_caffe2_mkl_num_threads);
}
- LOG(INFO) << "Caffe2 running with " << mkl_get_max_threads()
- << " MKL threads";
+ VLOG(1) << "Caffe2 running with " << mkl_get_max_threads() << " MKL threads";
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(
diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc
index 84a6028..522d8b5 100644
--- a/caffe2/core/logging.cc
+++ b/caffe2/core/logging.cc
@@ -23,8 +23,8 @@
}
size_t ReplaceAll(string& s, const char* from, const char* to) {
- CHECK(from && *from);
- CHECK(to);
+ CAFFE_ENFORCE(from && *from);
+ CAFFE_ENFORCE(to);
size_t numReplaced = 0;
string::size_type lenFrom = std::strlen(from);
@@ -37,12 +37,19 @@
return numReplaced;
}
+static std::function<string(void)> FetchStackTrace = []() { return ""; };
+
+void SetStackTraceFetcher(std::function<string(void)> fetcher) {
+ FetchStackTrace = fetcher;
+}
+
EnforceNotMet::EnforceNotMet(
const char* file,
const int line,
const char* condition,
const string& msg)
: msg_stack_{MakeString(
+ FetchStackTrace(),
"[enforce fail at ",
StripBasename(std::string(file)),
":",
@@ -50,22 +57,27 @@
"] ",
condition,
". ",
- msg)} {
+ msg,
+ " ")} {
if (FLAGS_caffe2_use_fatal_for_enforce) {
LOG(FATAL) << msg_stack_[0];
- } else {
- LOG(ERROR) << msg_stack_[0];
}
+ full_msg_ = this->msg();
}
void EnforceNotMet::AppendMessage(const string& msg) {
- LOG(ERROR) << msg;
msg_stack_.push_back(msg);
+ full_msg_ = this->msg();
}
string EnforceNotMet::msg() const {
return std::accumulate(msg_stack_.begin(), msg_stack_.end(), string(""));
}
+
+const char* EnforceNotMet::what() const noexcept {
+ return full_msg_.c_str();
+}
+
} // namespace caffe2
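SetStackTraceFetcher() above lets a process install a callback whose return value is prepended to every EnforceNotMet message; the diff adds the hook but no caller, so the following is only a sketch with a placeholder fetcher body:

#include <string>

#include "caffe2/core/logging.h"

int main(int argc, char** argv) {
  // Whatever string the fetcher returns will lead each CAFFE_ENFORCE failure
  // message; a real fetcher would return a symbolized backtrace.
  caffe2::SetStackTraceFetcher(
      []() -> std::string { return "<stack trace placeholder>"; });
  CAFFE_ENFORCE(argc > 1, "expected at least one argument");
  return 0;
}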
diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h
index d617e9a..d1b0ee8 100644
--- a/caffe2/core/logging.h
+++ b/caffe2/core/logging.h
@@ -3,6 +3,8 @@
#include <climits>
#include <exception>
+#include <functional>
+#include <limits>
#include <sstream>
#include "caffe2/core/flags.h"
@@ -75,6 +77,8 @@
// Returns number of replacements
size_t ReplaceAll(string& s, const char* from, const char* to);
+void SetStackTraceFetcher(std::function<string(void)> fetcher);
+
class EnforceNotMet : public std::exception {
public:
EnforceNotMet(
@@ -88,8 +92,11 @@
return msg_stack_;
}
+ const char* what() const noexcept override;
+
private:
vector<string> msg_stack_;
+ string full_msg_;
};
#define CAFFE_ENFORCE(condition, ...) \
@@ -104,6 +111,124 @@
throw ::caffe2::EnforceNotMet( \
__FILE__, __LINE__, "", ::caffe2::MakeString(__VA_ARGS__))
+/**
+ * Rich logging messages
+ *
+ * CAFFE_ENFORCE_THAT can be used with one of the "checker functions" that
+ * capture input argument values and add it to the exception message. E.g.
+ * `CAFFE_ENFORCE_THAT(Equals(foo(x), bar(y)), "Optional additional message")`
+ * would evaluate both foo and bar only once and if the results are not equal -
+ * include them in the exception message.
+ *
+ * Some of the basic checker functions like Equals or Greater are already
+ * defined below. Other header might define customized checkers by adding
+ * functions to caffe2::enforce_detail namespace. For example:
+ *
+ * namespace caffe2 { namespace enforce_detail {
+ * inline EnforceFailMessage IsVector(const vector<TIndex>& shape) {
+ * if (shape.size() == 1) { return EnforceOK(); }
+ * return MakeString("Shape ", shape, " is not a vector");
+ * }
+ * }}
+ *
+ * With further usages like `CAFFE_ENFORCE_THAT(IsVector(Input(0).dims()))`
+ *
+ * Convenient wrappers for binary operations like CAFFE_ENFORCE_EQ are provided
+ * too. Please use them instead of CHECK_EQ and friends for failures in
+ * user-provided input.
+ */
+
+namespace enforce_detail {
+
+struct EnforceOK {};
+
+class EnforceFailMessage {
+ public:
+ constexpr /* implicit */ EnforceFailMessage(EnforceOK) : msg_(nullptr) {}
+
+ EnforceFailMessage(EnforceFailMessage&&) = default;
+ EnforceFailMessage(const EnforceFailMessage&) = delete;
+ EnforceFailMessage& operator=(EnforceFailMessage&&) = delete;
+ EnforceFailMessage& operator=(const EnforceFailMessage&) = delete;
+
+ // Catch all wrong usages like CAFFE_ENFORCE_THAT(x < y)
+ template <class... Args>
+ /* implicit */ EnforceFailMessage(Args...) {
+ static_assert(
+ // This stands for an "impossible" condition. Plain `false` doesn't
+ // trick compiler enough.
+ sizeof...(Args) == std::numeric_limits<std::size_t>::max(),
+ "CAFFE_ENFORCE_THAT has to be used with one of special check functions "
+ "like `Equals`. Use CAFFE_ENFORCE for simple boolean checks.");
+ }
+
+ /* implicit */ EnforceFailMessage(std::string&& msg) {
+ msg_ = new std::string(std::move(msg));
+ }
+ inline bool bad() const {
+ return msg_;
+ }
+ std::string get_message_and_free(std::string&& extra) const {
+ std::string r;
+ if (extra.empty()) {
+ r = std::move(*msg_);
+ } else {
+ r = ::caffe2::MakeString(std::move(*msg_), ". ", std::move(extra));
+ }
+ delete msg_;
+ return r;
+ }
+
+ private:
+ std::string* msg_;
+};
+
+#define BINARY_COMP_HELPER(name, op) \
+ template <typename T1, typename T2> \
+ inline EnforceFailMessage name(const T1& x, const T2& y) { \
+ if (x op y) { \
+ return EnforceOK(); \
+ } \
+ return MakeString(x, " vs ", y); \
+ }
+BINARY_COMP_HELPER(Equals, ==)
+BINARY_COMP_HELPER(NotEquals, !=)
+BINARY_COMP_HELPER(Greater, >)
+BINARY_COMP_HELPER(GreaterEquals, >=)
+BINARY_COMP_HELPER(Less, <)
+BINARY_COMP_HELPER(LessEquals, <=)
+#undef BINARY_COMP_HELPER
+
+#define CAFFE_ENFORCE_THAT_IMPL(condition, expr, ...) \
+ do { \
+ using namespace ::caffe2::enforce_detail; \
+ const EnforceFailMessage& r = (condition); \
+ if (r.bad()) { \
+ throw ::caffe2::EnforceNotMet( \
+ __FILE__, \
+ __LINE__, \
+ expr, \
+ r.get_message_and_free(::caffe2::MakeString(__VA_ARGS__))); \
+ } \
+ } while (false)
+}
+
+#define CAFFE_ENFORCE_THAT(condition, ...) \
+ CAFFE_ENFORCE_THAT_IMPL((condition), #condition, __VA_ARGS__)
+
+#define CAFFE_ENFORCE_EQ(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(Equals((x), (y)), #x " == " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_NE(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(NotEquals((x), (y)), #x " != " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_LE(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(LessEquals((x), (y)), #x " <= " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_LT(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(Less((x), (y)), #x " < " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_GE(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(GreaterEquals((x), (y)), #x " >= " #y, __VA_ARGS__)
+#define CAFFE_ENFORCE_GT(x, y, ...) \
+ CAFFE_ENFORCE_THAT_IMPL(Greater((x), (y)), #x " > " #y, __VA_ARGS__)
+
} // namespace caffe2
#endif // CAFFE2_CORE_LOGGING_H_
diff --git a/caffe2/core/logging_test.cc b/caffe2/core/logging_test.cc
index a8d494e..cce709e 100644
--- a/caffe2/core/logging_test.cc
+++ b/caffe2/core/logging_test.cc
@@ -17,16 +17,58 @@
CAFFE_ENFORCE(false, "This throws.");
// This should never be triggered.
EXPECT_FALSE(true);
- } catch (const EnforceNotMet& err) {}
+ } catch (const EnforceNotMet& err) {
+ }
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kFalse);
}
+TEST(LoggingTest, TestEnforceEquals) {
+ int x = 4;
+ int y = 5;
+ try {
+ CAFFE_ENFORCE_THAT(Equals(++x, ++y));
+ // This should never be triggered.
+ EXPECT_FALSE(true);
+ } catch (const EnforceNotMet& err) {
+ EXPECT_NE(err.msg().find("5 vs 6"), string::npos);
+ }
+
+ // arguments are expanded only once
+ CAFFE_ENFORCE_THAT(Equals(++x, y));
+ EXPECT_EQ(x, 6);
+ EXPECT_EQ(y, 6);
+}
+
+TEST(LoggingTest, EnforceShowcase) {
+ // It's not really a test but rather a convenient thing that you can run and
+ // see all messages
+ int one = 1;
+ int two = 2;
+ int three = 3;
+#define WRAP_AND_PRINT(exp) \
+ try { \
+ exp; \
+ } catch (const EnforceNotMet& err) { \
+ /* EnforceNotMet already does LOG(ERROR) */ \
+ }
+ WRAP_AND_PRINT(CAFFE_ENFORCE_EQ(one, two));
+ WRAP_AND_PRINT(CAFFE_ENFORCE_NE(one * 2, two));
+ WRAP_AND_PRINT(CAFFE_ENFORCE_GT(one, two));
+ WRAP_AND_PRINT(CAFFE_ENFORCE_GE(one, two));
+ WRAP_AND_PRINT(CAFFE_ENFORCE_LT(three, two));
+ WRAP_AND_PRINT(CAFFE_ENFORCE_LE(three, two));
+
+ WRAP_AND_PRINT(CAFFE_ENFORCE_EQ(
+ one * two + three, three * two, "It's a pretty complicated expression"));
+
+ WRAP_AND_PRINT(CAFFE_ENFORCE_THAT(Equals(one * two + three, three * two)));
+}
+
TEST(LoggingDeathTest, TestEnforceUsingFatal) {
bool kTrue = true;
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
- EXPECT_DEATH(
- CAFFE_ENFORCE(false, "This goes fatal."), "");
+ EXPECT_DEATH(CAFFE_ENFORCE(false, "This goes fatal."), "");
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
}
-} // namespace caffe2
+} // namespace caffe2
diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc
index 52534e7..fabfc45 100644
--- a/caffe2/core/net.cc
+++ b/caffe2/core/net.cc
@@ -1,6 +1,8 @@
#include "caffe2/core/net.h"
#include <set>
+#include <stack>
+#include <unordered_map>
#include <unordered_set>
#include "caffe2/core/operator.h"
@@ -9,7 +11,7 @@
CAFFE2_DEFINE_bool(
caffe2_disable_chaining,
- true,
+ false,
"Disable chaining logic (some latent multi-device issues).");
namespace caffe2 {
@@ -23,25 +25,6 @@
}
using OpIndex = int;
-using Ancestry = std::vector<std::unordered_set<OpIndex>>;
-Ancestry computeAncestors(
- const std::vector<internal::OperatorNode>& ops) {
- Ancestry ancestors;
- ancestors.resize(ops.size());
- for (auto i = 0; i < ops.size(); ++i) {
- const auto& parents = ops[i].parents_;
- for (const auto parent : parents) {
- ancestors[i].insert(parent);
- for (const auto parent_ancestor : ancestors[parent]) {
- ancestors[i].insert(parent_ancestor);
- }
- }
- VLOG(2) << "Ancestors of op: " << i << ", "
- << std::vector<OpIndex>(ancestors[i].begin(), ancestors[i].end());
- }
- return ancestors;
-}
-
DAGNetBase::ExecutionChains singleChains(
const std::vector<internal::OperatorNode>& nodes) {
DAGNetBase::ExecutionChains chains;
@@ -53,74 +36,136 @@
DAGNetBase::ExecutionChains computeChains(
const std::vector<internal::OperatorNode>& nodes) {
- const auto& ancestry = computeAncestors(nodes);
+ vector<int> initial_frontier;
+ for (int idx = 0; idx < nodes.size(); ++idx) {
+ if (nodes[idx].parents_.size() == 0) {
+ initial_frontier.push_back(idx);
+ }
+ }
+ // We need to construct the node_seen_count to know how many inner edges each
+ // node has.
+ std::unordered_map<OpIndex, int> node_seen_count;
+
+ for (int root_index : initial_frontier) {
+ const auto& root = nodes[root_index];
+ std::stack<std::pair<OpIndex, std::vector<int>::const_iterator>>
+ depth_stack;
+ depth_stack.push(make_pair(root_index, root.children_.begin()));
+ node_seen_count[root_index]++;
+ CAFFE_ENFORCE(
+ node_seen_count[root_index] == 1,
+ "root node ",
+ root_index,
+ " visit count must be == 1");
+
+ while (depth_stack.size() > 0) {
+ auto cur = depth_stack.top();
+ depth_stack.pop();
+ if (cur.second != nodes[cur.first].children_.end()) {
+ OpIndex node_index = *cur.second;
+ node_seen_count[node_index]++;
+ cur.second++;
+ depth_stack.push(cur);
+ if (node_seen_count[node_index] == 1) {
+ // Visit each child only once.
+ depth_stack.push(
+ make_pair(node_index, nodes[node_index].children_.begin()));
+ }
+ }
+ }
+ }
// Now, we compute the set of execution chains. An execution chain is
// a linear set of nodes that can be executed on a single stream
// (e.g. a chain of single input, single output operators)
DAGNetBase::ExecutionChains chains;
std::unordered_set<OpIndex> seen_nodes;
- for (auto i = 0; i < nodes.size(); ++i) {
- if (seen_nodes.find(i) != seen_nodes.end()) {
- // We've already executed this operator.
- continue;
- }
- // Compute the execution chain rooted at this node.
- std::vector<OpIndex> chain;
- chain.push_back(i);
-
- while (true) {
- const auto current = chain.back();
- const auto& children = nodes[current].children_;
-
- // Find children for which this current node is the *single*
- // direct ancestor. If there are more than one, then we can't
- // chain.
- std::vector<OpIndex> candidates;
- for (const auto child : children) {
- std::vector<OpIndex> direct_parents;
- const auto& parents = nodes[child].parents_;
- for (const auto parent : parents) {
- if (std::all_of(
- parents.begin(), parents.end(), [&](OpIndex other_parent) {
- // If `other_parent` contains `parent` in it's
- // ancestors, we can ignore `parent`.
- return !ancestry.at(other_parent).count(parent);
- })) {
- direct_parents.push_back(parent);
- }
- }
- if (direct_parents.size() == 1 && direct_parents.front() == current) {
- candidates.push_back(child);
- }
- }
-
- if (candidates.size() != 1) {
- break;
- }
-
- const auto candidate = candidates.front();
- const auto parent = chain.back();
-
- if (!sameDevice(
- nodes[candidate].operator_->def(),
- nodes[parent].operator_->def())) {
- break;
- }
-
- chain.push_back(candidate);
- };
-
- for (const auto node : chain) {
+ std::vector<OpIndex> chain;
+ std::pair<OpIndex, std::vector<int>::const_iterator> cur;
+ std::stack<std::pair<OpIndex, std::vector<int>::const_iterator>> depth_stack;
+ auto check_current_for_chaining = [&]() -> bool {
+ return (
+ node_seen_count[cur.first] == 1 &&
+ (chain.size() == 0 || sameDevice(
+ nodes[cur.first].operator_->def(),
+ nodes[chain.back()].operator_->def())));
+ };
+ auto commit_chain = [&]() {
+ if (chain.size() > 0) {
CAFFE_ENFORCE(
- seen_nodes.insert(node).second,
- "Node ",
- node,
- " is already in the net.");
+ chains.insert({chain.front(), chain}).second,
+ "Chain ",
+ chain.front(),
+ " was already added.");
+ VLOG(2) << "Added chain: " << chain.front() << "with elements";
+ for (auto ch : chain) {
+ VLOG(2) << ch << ", ";
+ }
+ chain.clear();
}
- CAFFE_ENFORCE(
- chains.insert({i, chain}).second, "Chain ", i, " was already added.");
- VLOG(2) << "Added chain: " << chain;
+ };
+ auto depth_traverse = [&]() {
+ while (cur.second != nodes[cur.first].children_.end() &&
+ seen_nodes.find(*cur.second) != seen_nodes.end()) {
+ cur.second++;
+ }
+
+ if (cur.second != nodes[cur.first].children_.end()) {
+ auto next = make_pair(*cur.second, nodes[*cur.second].children_.begin());
+ depth_stack.push(cur);
+ depth_stack.push(next);
+ }
+ };
+ for (int root_index : initial_frontier) {
+ depth_stack.push(
+ make_pair(root_index, nodes[root_index].children_.begin()));
+ while (depth_stack.size() > 0) {
+ cur = depth_stack.top();
+ depth_stack.pop();
+ if (seen_nodes.find(cur.first) == seen_nodes.end()) {
+ seen_nodes.insert(cur.first);
+ // Has one child, can be candidate for chain or can be added to the
+ // previous chain.
+ if (nodes[cur.first].children_.size() == 1) {
+ if (check_current_for_chaining()) {
+ // Add oneself to the current chain.
+ VLOG(1) << "Adding to existing chain" << cur.first;
+ chain.push_back(cur.first);
+ int index = *nodes[cur.first].children_.begin();
+ depth_stack.push(make_pair(index, nodes[index].children_.begin()));
+ } else {
+ // Can't belong to the previous chain, commit previous chain and
+ // start a new one.
+ commit_chain();
+ chain.push_back(cur.first);
+ int index = *nodes[cur.first].children_.begin();
+ depth_stack.push(make_pair(index, nodes[index].children_.begin()));
+ }
+ } else if (
+ nodes[cur.first].children_.size() == 0 &&
+ check_current_for_chaining()) {
+ // Add current node to the current chain and commit.
+ chain.push_back(cur.first);
+ commit_chain();
+ } else {
+ // Node has more than one child.
+ commit_chain();
+ // Add current node as an independent chain since it won't be a part
+ // of a bigger chain.
+ chain.push_back(cur.first);
+ commit_chain();
+ depth_traverse();
+ }
+ } else {
+ // This node has been seen before, we will only traverse its children.
+ // Commit any pending chains and continue traversing.
+ commit_chain();
+ depth_traverse();
+ }
+ } // End while
+
+ // Check if this if is even needed.
+ commit_chain();
}
CAFFE_ENFORCE(
seen_nodes.size() == nodes.size(),
@@ -131,7 +176,6 @@
".");
return chains;
}
-
}
CAFFE_DEFINE_REGISTRY(NetRegistry, NetBase, const NetDef&, Workspace*);
@@ -150,12 +194,19 @@
for (const string& in : op.input()) {
if (!known_blobs.count(in)) {
if (external_input_.size()) {
- CAFFE_ENFORCE(false,
- "Source for input ", in, " is unknown.");
+ CAFFE_THROW(
+ "op ",
+ op.type(),
+ ": Source for input ",
+ in,
+ " is unknown for net ",
+ def.name(),
+ ", operator ",
+ ProtoDebugString(op));
} else {
// If we are not declaring input and output, we will simply VLOG it
// for debugging purposes.
- VLOG(1) << "Source for input " << in << " is unknown.";
+ VLOG(1) << "op " << op.type() << ": input " << in << " is unknown.";
}
}
}
@@ -168,7 +219,10 @@
CAFFE_ENFORCE(
remaining_output.size() == 0,
"Some of the blobs are declared as output but never produced by the "
- "net.");
+ "net ",
+ def.name(),
+ ", the first one is ",
+ *remaining_output.begin());
}
unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws) {
@@ -182,6 +236,7 @@
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws) {
+ VLOG(1) << "Constructing SimpleNet " << net_def.name();
bool net_def_has_device_option = net_def.has_device_option();
// Initialize the operators
for (const OperatorDef& operator_def : net_def.op()) {
@@ -317,6 +372,7 @@
DAGNetBase::DAGNetBase(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws), operator_nodes_(net_def.op_size()) {
// Blob creator allows us to track which operator created which blob.
+ VLOG(1) << "Constructing DAGNet " << net_def.name();
std::map<string, int> blob_creator;
std::map<string, std::set<int> > blob_readers;
bool net_def_has_device_option = net_def.has_device_option();
@@ -411,6 +467,9 @@
(FLAGS_caffe2_disable_chaining ? singleChains(operator_nodes_)
: computeChains(operator_nodes_));
+ LOG(INFO) << "Number of parallel execution chains "
+ << execution_chains_.size()
+ << " Number of operators = " << net_def.op_size();
// TODO: do we want to make sure that there are no loops in the
// dependency graph?
diff --git a/caffe2/core/net_gpu.cc b/caffe2/core/net_gpu.cc
index bd41dbb..fa3c6e3 100644
--- a/caffe2/core/net_gpu.cc
+++ b/caffe2/core/net_gpu.cc
@@ -163,6 +163,7 @@
class AsyncDAGNet : public DAGNetBase {
public:
AsyncDAGNet(const NetDef& net_def, Workspace* ws) : DAGNetBase(net_def, ws) {
+ VLOG(1) << "Constructing Async DAG Net " << net_def.name();
eventRecorded_.resize(net_def.op_size());
events_.reserve(net_def.op_size());
for (int idx = 0; idx < net_def.op_size(); ++idx) {
diff --git a/caffe2/core/net_test.cc b/caffe2/core/net_test.cc
index ae6f3a4..f9e7854 100644
--- a/caffe2/core/net_test.cc
+++ b/caffe2/core/net_test.cc
@@ -44,8 +44,8 @@
const vector<string>& input,
const vector<string>& output) {
NetDef net_def;
- CHECK(google::protobuf::TextFormat::ParseFromString(
- kExampleNetDefString, &net_def));
+ CAFFE_ENFORCE(google::protobuf::TextFormat::ParseFromString(
+ kExampleNetDefString, &net_def));
for (const auto& name : input) {
net_def.add_external_input(name);
}
@@ -105,7 +105,7 @@
Workspace ws;
ws.CreateBlob("in");
NetDef net_def;
- CHECK(google::protobuf::TextFormat::ParseFromString(spec, &net_def));
+ CAFFE_ENFORCE(google::protobuf::TextFormat::ParseFromString(spec, &net_def));
{
auto old = FLAGS_caffe2_disable_chaining;
auto g = MakeGuard([&]() { FLAGS_caffe2_disable_chaining = old; });
@@ -201,34 +201,34 @@
checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2}}});
}
-TEST(NetTest, ChainingForJoinWithAncestor) {
- const auto spec = R"DOC(
- name: "example"
- type: "dag"
- external_input: "in"
- op {
- input: "in"
- output: "hidden"
- type: "NetTestDummy"
- }
- op {
- input: "hidden"
- output: "out1"
- type: "NetTestDummy"
- }
- op {
- input: "hidden"
- output: "out2"
- type: "NetTestDummy"
- }
- op {
- input: "hidden"
- input: "out2"
- type: "NetTestDummy"
- }
-)DOC";
- checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2, 3}}});
-}
+// TEST(NetTest, ChainingForJoinWithAncestor) {
+// const auto spec = R"DOC(
+// name: "example"
+// type: "dag"
+// external_input: "in"
+// op {
+// input: "in"
+// output: "hidden"
+// type: "NetTestDummy"
+// }
+// op {
+// input: "hidden"
+// output: "out1"
+// type: "NetTestDummy"
+// }
+// op {
+// input: "hidden"
+// output: "out2"
+// type: "NetTestDummy"
+// }
+// op {
+// input: "hidden"
+// input: "out2"
+// type: "NetTestDummy"
+// }
+// )DOC";
+// checkChaining(spec, {{0, {0}}, {1, {1}}, {2, {2, 3}}});
+// }
TEST(NetTest, ChainingForForkJoin) {
const auto spec = R"DOC(
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index 79a2a97..387d87a 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -8,6 +8,7 @@
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
+#include "caffe2/utils/string_utils.h"
namespace caffe2 {
@@ -16,8 +17,12 @@
: operator_def_(operator_def), arg_helper_(operator_def_) {
for (const string& input_str : operator_def_.input()) {
auto* blob = ws->GetBlob(input_str);
- CAFFE_ENFORCE(blob != nullptr,
- "Encountered a non-existing input blob: ", input_str);
+ CAFFE_ENFORCE(
+ blob != nullptr,
+ "op ",
+ operator_def_.type(),
+ ": Encountered a non-existing input blob: ",
+ input_str);
inputs_.push_back(blob);
}
for (const string& output_str : operator_def_.output()) {
@@ -28,16 +33,23 @@
namespace {
unique_ptr<OperatorBase> TryCreateOperator(
const string& key, const OperatorDef& operator_def, Workspace* ws) {
- switch (operator_def.device_option().device_type()) {
- case CPU:
- VLOG(1) << "Creating CPU operator " << key;
- return CPUOperatorRegistry()->Create(key, operator_def, ws);
- case CUDA:
- VLOG(1) << "Creating CUDA operator " << key;
- return CUDAOperatorRegistry()->Create(key, operator_def, ws);
- default:
- LOG(FATAL) << "Unknown device type: "
- << operator_def.device_option().device_type();
+ try {
+ switch (operator_def.device_option().device_type()) {
+ case CPU:
+ VLOG(1) << "Creating CPU operator " << key;
+ return CPUOperatorRegistry()->Create(key, operator_def, ws);
+ case CUDA:
+ VLOG(1) << "Creating CUDA operator " << key;
+ return CUDAOperatorRegistry()->Create(key, operator_def, ws);
+ default:
+ LOG(FATAL) << "Unknown device type: "
+ << operator_def.device_option().device_type();
+ return nullptr;
+ }
+ } catch (const UnsupportedOperatorFeature& err) {
+ VLOG(1) << "Operator " << operator_def.type()
+ << " with engine does not support the requested feature. Msg: "
+ << err.what() << ". Proto is: " << ProtoDebugString(operator_def);
return nullptr;
}
}
@@ -63,17 +75,21 @@
// Second, if the user has provided an engine, try create that engine
if (operator_def.engine().size()) {
- string key = operator_def.type() + "_ENGINE_" + operator_def.engine();
- VLOG(1) << "Trying to create operator " << operator_def.type()
- << " with engine " << operator_def.engine();
- auto op = TryCreateOperator(key, operator_def, ws);
- if (op) {
- return op;
+ vector<string> engine_choices = split(',', operator_def.engine());
+ for (const string& engine : engine_choices) {
+ string key = operator_def.type() + "_ENGINE_" + engine;
+ VLOG(1) << "Trying to create operator " << operator_def.type()
+ << " with engine " << engine;
+ auto op = TryCreateOperator(key, operator_def, ws);
+ if (op) {
+ return op;
+ } else {
+        // If this engine is not available, move on to the next engine
+        // choice, or eventually fall back to the default implementation.
+ VLOG(1) << "Operator with engine " << engine
+ << " is not available. Using default implementation.";
+ }
}
- // If the above fails, we will just return the normal case with the default
- // implementation.
- VLOG(1) << "Operator with engine " << operator_def.engine()
- << " is not available. Using default implementation.";
}
// Lastly, if the engine does not work here, try using the default engine.
@@ -142,10 +158,11 @@
} else if (grad.IsDense()) {
VLOG(1) << "\t [dense]" << grad.dense_;
} else {
- CHECK(grad.indices_.size() && grad.values_.size())
- << "For sparse gradient, one should set both indices and values. "
- << "Currently we have: (" << grad.indices_ << ", " << grad.values_
- << ").";
+ CAFFE_ENFORCE(
+ grad.indices_.size() && grad.values_.size(),
+ "For sparse gradient, one should set both indices and values. "
+ "Currently we have: (" +
+ grad.indices_ + ", " + grad.values_ + ").");
VLOG(1) << "\t [sparse] " << grad.indices_ << ", " << grad.values_;
}
}
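
Note on the engine lookup above: `engine` may now hold a comma-separated preference list. CreateOperator() tries each engine left to right and, if none of them can be built, falls back to the default registration (the MultipleEngineChoices test later in this diff exercises exactly this). A minimal sketch, assuming the stock Conv registration; the CUDNN/EIGEN engine names and blob names are only illustrative:

    Workspace ws;
    ws.CreateBlob("X");  // input blobs must already exist in the workspace,
    ws.CreateBlob("W");  // or the OperatorBase constructor will enforce-fail
    ws.CreateBlob("b");
    OperatorDef def = CreateOperatorDef(
        "Conv",
        "",
        vector<string>{"X", "W", "b"},
        vector<string>{"Y"},
        vector<Argument>{MakeArgument<int>("kernel", 3)});
    // Preference order: try CUDNN, then EIGEN, then the default Conv op.
    def.set_engine("CUDNN,EIGEN");
    unique_ptr<OperatorBase> op = CreateOperator(def, &ws);
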
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 4df4e10..0f5fcac 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -3,6 +3,7 @@
#include <climits>
#include <cstddef>
+#include <exception>
#include <typeinfo>
#include <vector>
@@ -162,7 +163,7 @@
}
return (started && finished);
} catch (EnforceNotMet& err) {
- err.AppendMessage("Error from operator " + ProtoDebugString(def()));
+ err.AppendMessage("Error from operator: \n" + ProtoDebugString(def()));
throw;
}
}
@@ -172,7 +173,7 @@
context_.SwitchToDevice();
return RunOnDevice();
} catch (EnforceNotMet& err) {
- err.AppendMessage("Error from operator " + ProtoDebugString(def()));
+ err.AppendMessage("Error from operator: \n" + ProtoDebugString(def()));
throw;
}
}
@@ -339,6 +340,30 @@
#define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__)
+// An exception that can be thrown by an operator constructor that notifies
+// that it does not support the given setting. This can be usually used for
+// specific engines that only implement a subset of the features required by
+// the original operator schema.
+// TODO(jiayq): make more feature-complete exception message.
+class UnsupportedOperatorFeature : public std::exception {
+ public:
+ UnsupportedOperatorFeature(const string& msg) : msg_(msg) {}
+ const char* what() const noexcept override {
+ return msg_.c_str();
+ }
+
+ private:
+ string msg_;
+};
+
+// A helper macro that should ONLY be used in the operator constructor to check
+// if needed features are met. If not, throws the UnsupportedOperatorFeature
+// exception with the given message.
+#define OPERATOR_NEEDS_FEATURE(condition, message) \
+ if (!(condition)) { \
+ throw UnsupportedOperatorFeature(message); \
+ }
+
// Creates an operator with the given operator definition.
unique_ptr<OperatorBase> CreateOperator(
const OperatorDef& operator_def, Workspace* ws);
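
The OPERATOR_NEEDS_FEATURE macro above is only exercised indirectly by the operator tests below (they throw UnsupportedOperatorFeature directly), so here is a minimal sketch of the intended constructor-time usage. MyEngineOp and its NCHW-only restriction are made up for illustration:

    #include "caffe2/core/operator.h"

    namespace caffe2 {
    class MyEngineOp final : public Operator<CPUContext> {
     public:
      MyEngineOp(const OperatorDef& def, Workspace* ws)
          : Operator<CPUContext>(def, ws) {
        // If the condition fails, TryCreateOperator() catches the resulting
        // UnsupportedOperatorFeature and CreateOperator() moves on to the next
        // engine choice or the default implementation.
        OPERATOR_NEEDS_FEATURE(
            OperatorBase::GetSingleArgument<string>("order", "NCHW") == "NCHW",
            "MyEngine only supports NCHW order.");
      }
      bool RunOnDevice() override {
        return true;
      }
    };
    // REGISTER_CPU_OPERATOR_WITH_ENGINE(SomeExistingOp, MYENGINE, MyEngineOp);
    } // namespace caffe2
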
diff --git a/caffe2/core/operator_schema_test.cc b/caffe2/core/operator_schema_test.cc
index 5da2bfe..7fbc95b 100644
--- a/caffe2/core/operator_schema_test.cc
+++ b/caffe2/core/operator_schema_test.cc
@@ -190,6 +190,10 @@
// deduces the
// schema from the "to" argument.
const OpSchema* schema = OpSchemaRegistry::Schema("Cast");
+ if (!schema) {
+ // Compiled without the Cast op.
+ return;
+ }
OperatorDef def = CreateOperatorDef(
"Cast",
"",
diff --git a/caffe2/core/operator_test.cc b/caffe2/core/operator_test.cc
index 98c2de1..bd875d6 100644
--- a/caffe2/core/operator_test.cc
+++ b/caffe2/core/operator_test.cc
@@ -13,6 +13,34 @@
public:
using OperatorBase::OperatorBase;
bool Run() override { return true; }
+ virtual string type() {
+ return "base";
+ }
+};
+
+class JustTestAndNeverConstructs : public JustTest {
+ public:
+ JustTestAndNeverConstructs(const OperatorDef& def, Workspace* ws)
+ : JustTest(def, ws) {
+ throw UnsupportedOperatorFeature("I just don't construct.");
+ }
+ bool Run() override {
+ return true;
+ }
+ string type() override {
+ return "FOO";
+ }
+};
+
+class JustTestAndDoesConstruct : public JustTest {
+ public:
+ using JustTest::JustTest;
+ bool Run() override {
+ return true;
+ }
+ string type() override {
+ return "BAR";
+ }
};
class ThrowException : public Operator<CPUContext> {
@@ -28,6 +56,8 @@
OPERATOR_SCHEMA(ThrowException).NumInputs(0).NumOutputs(0);
REGISTER_CPU_OPERATOR(JustTest, JustTest);
+REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, FOO, JustTestAndNeverConstructs);
+REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, BAR, JustTestAndDoesConstruct);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
REGISTER_CPU_OPERATOR(ThrowException, ThrowException);
@@ -65,6 +95,26 @@
}
}
+TEST(OperatorTest, FallbackIfEngineDoesNotBuild) {
+ OperatorDef op_def;
+ Workspace ws;
+ op_def.set_type("JustTest");
+ op_def.set_engine("FOO");
+ unique_ptr<OperatorBase> op = CreateOperator(op_def, &ws);
+ EXPECT_NE(nullptr, op.get());
+ EXPECT_EQ(static_cast<JustTest*>(op.get())->type(), "base");
+}
+
+TEST(OperatorTest, MultipleEngineChoices) {
+ OperatorDef op_def;
+ Workspace ws;
+ op_def.set_type("JustTest");
+ op_def.set_engine("FOO,BAR");
+ unique_ptr<OperatorBase> op = CreateOperator(op_def, &ws);
+ EXPECT_NE(nullptr, op.get());
+ EXPECT_EQ(static_cast<JustTest*>(op.get())->type(), "BAR");
+}
+
TEST(OperatorTest, CannotUseUninitializedBlob) {
Workspace ws;
OperatorDef op_def;
diff --git a/caffe2/core/parallel_net_test.cc b/caffe2/core/parallel_net_test.cc
index e218063..7332be1 100644
--- a/caffe2/core/parallel_net_test.cc
+++ b/caffe2/core/parallel_net_test.cc
@@ -89,14 +89,14 @@
// Run a network and get its duration in milliseconds.
int RunNetAndGetDuration(const string& net_def_str, const string& type) {
NetDef net_def;
- CHECK(google::protobuf::TextFormat::ParseFromString(
- net_def_str, &net_def));
+ CAFFE_ENFORCE(
+ google::protobuf::TextFormat::ParseFromString(net_def_str, &net_def));
net_def.set_type(type);
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
- CHECK(net.get() != nullptr);
+ CAFFE_ENFORCE(net.get() != nullptr);
auto start_time = std::chrono::system_clock::now();
- CHECK(net->Run());
+ CAFFE_ENFORCE(net->Run());
// Inspect the time - it should be around 200 milliseconds, since sleep3 can
// run in parallel with sleep1 and sleep2.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc
index 7d7d2d1..47d9f08 100644
--- a/caffe2/core/predictor.cc
+++ b/caffe2/core/predictor.cc
@@ -31,8 +31,11 @@
}
}
-Predictor::Predictor(const NetDef& init_net, const NetDef& run_net)
- : run_net_(run_net) {
+Predictor::Predictor(
+ const NetDef& init_net,
+ const NetDef& run_net,
+ Workspace* parent)
+ : run_net_(run_net), ws_(parent) {
CAFFE_ENFORCE(ws_.RunNetOnce(init_net));
CAFFE_ENFORCE(ws_.CreateNet(run_net));
}
diff --git a/caffe2/core/predictor.h b/caffe2/core/predictor.h
index 687c133..7767ece 100644
--- a/caffe2/core/predictor.h
+++ b/caffe2/core/predictor.h
@@ -10,7 +10,10 @@
using TensorVector = std::vector<TensorCPU*>;
// Runs the `init_net` once, then saves the `run_net` to be executed
// in `::run`
- Predictor(const NetDef& init_net, const NetDef& run_net);
+ Predictor(
+ const NetDef& init_net,
+ const NetDef& run_net,
+ Workspace* parent = nullptr);
// Executes `run_net` on the inputs.
// The first `inputs.size()` inputs from run_net::external_inputs
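
The new `parent` argument lets a predictor's internal workspace share blobs (for example, weights that were loaded once) with an existing workspace. A small sketch; the nets are assumed to be populated elsewhere:

    Workspace parent;          // hypothetical workspace that already owns shared blobs
    NetDef init_net, run_net;  // assumed to be parsed from serialized protos
    Predictor predictor(init_net, run_net, &parent);
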
diff --git a/caffe2/core/registry_test.cc b/caffe2/core/registry_test.cc
index 2f879c0c..6683d95 100644
--- a/caffe2/core/registry_test.cc
+++ b/caffe2/core/registry_test.cc
@@ -6,7 +6,8 @@
#include "caffe2/core/logging.h"
namespace caffe2 {
-namespace registry_test {
+namespace {
+
class Foo {
public:
explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
@@ -41,8 +42,5 @@
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr);
}
-
-} // registry_test
+}
} // namespace caffe2
-
-
diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc
index 7d0a358..4355a1e 100644
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@@ -5,3 +5,8 @@
caffe2_keep_on_shrink,
true,
"If set, keeps memory when a tensor is shrinking its size.");
+
+namespace caffe2 {
+// declaring it here instead of context.cc because tensor.h includes context.h
+CAFFE_KNOWN_TYPE(Tensor<CPUContext>);
+}
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 6886152..b25336e 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -296,7 +296,7 @@
// It is possible that the source tensor hasn't called mutable_data() yet,
// in which case ShareData() doesn't make much sense since we don't really
// know what to share yet.
- CHECK(src.data_.get()) << "Source tensor has no content yet.";
+ CAFFE_ENFORCE(src.data_.get(), "Source tensor has no content yet.");
// Finally, do sharing.
data_ = src.data_;
capacity_ = src.capacity_;
@@ -313,8 +313,9 @@
template <typename T>
void ShareExternalPointer(T* src, size_t capacity = 0) {
meta_ = TypeMeta::Make<T>();
- CHECK(size_ > 0)
- << "To share data with a raw pointer, you need to set shape first.";
+ CAFFE_ENFORCE(
+ size_ > 0,
+ "To share data with a raw pointer, you need to set shape first.");
data_.reset(src, [](void*)->void {});
// Sets capacity. If not specified, we will implicitly assume that
// the capacity is the current size.
@@ -344,8 +345,9 @@
inline const T* data() const {
CAFFE_ENFORCE(
data_.get() || size_ == 0,
- "The tensor is uninitialized. You probably need to call ",
- "Resize() and mutable_data() first.");
+ "The tensor is of non-zero shape, but its data is not allocated yet. "
+ "Caffe2 uses a lazy allocation, so you will need to call "
+ "mutable_data() or raw_mutable_data() to actually allocate memory.");
CAFFE_ENFORCE(
IsType<T>(),
"Tensor type mistmatch, caller expects elements to be ",
@@ -467,7 +469,7 @@
// Product of all dims up to
inline TIndex size_to_dim(int k) const {
- CHECK(k < dims_.size());
+ CAFFE_ENFORCE(k < dims_.size());
TIndex r = 1;
for (int i = 0; i < k; ++i) {
r *= dims_[i];
@@ -544,11 +546,12 @@
bool SetDims(const vector<T>& src) {
auto old_size = size_;
dims_.resize(src.size());
- size_ = 1;
+ TIndex new_size = 1;
for (int i = 0; i < src.size(); ++i) {
- size_ *= src[i];
+ new_size *= src[i];
dims_[i] = src[i];
}
+ size_ = new_size;
return size_ != old_size;
}
diff --git a/caffe2/core/typeid.cc b/caffe2/core/typeid.cc
index 5b6352b..c7163a4 100644
--- a/caffe2/core/typeid.cc
+++ b/caffe2/core/typeid.cc
@@ -35,5 +35,5 @@
};
static UninitializedTypeNameRegisterer g_uninitialized_type_name_registerer;
-} // namespace
-} // namespace caffe2
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h
index c737b41..4d68c7a 100644
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@@ -29,10 +29,10 @@
template <typename T>
struct TypeNameRegisterer {
- TypeNameRegisterer() {
+ explicit TypeNameRegisterer(CaffeTypeId id) {
#ifdef __GXX_RTTI
string name = Demangle(typeid(T).name());
- gTypeNames()[reinterpret_cast<CaffeTypeId>(Id())] = name;
+ gTypeNames()[reinterpret_cast<CaffeTypeId>(id)] = name;
// If we are in RTTI mode, we will also use this opportunity to do sanity
// check if there are duplicated ids registered for the same type. This
// usually happens when one does not do RTLD_GLOBAL, which is often the
@@ -42,20 +42,15 @@
if (gRegisteredTypeNames().count(name)) {
std::cerr << "Type name " << name
<< " registered twice. This should "
- "not happen. Are you using RTLD_GLOBAL correctly?"
+ "not happen. Do you have duplicated CAFFE_KNOWN_TYPE?"
<< std::endl;
throw std::runtime_error("TypeNameRegisterer error with type " + name);
}
gRegisteredTypeNames().insert(name);
-#else // __GXX_RTTI
- gTypeNames()[reinterpret_cast<CaffeTypeId>(Id())] =
+#else // __GXX_RTTI
+ gTypeNames()[reinterpret_cast<CaffeTypeId>(id)] =
"(RTTI disabled, cannot show name)";
-#endif // __GXX_RTTI
- }
-
- static CaffeTypeId Id() {
- static bool type_id_bit[1];
- return reinterpret_cast<CaffeTypeId>(type_id_bit);
+#endif // __GXX_RTTI
}
};
@@ -73,20 +68,24 @@
/** Create a dummy TypeMeta object. To create a TypeMeta object for a specific
* type, use TypeMeta::Make<T>().
*/
- TypeMeta() : id_(0), itemsize_(0), ctor_(nullptr), copy_(nullptr),
- dtor_(nullptr) {}
+ TypeMeta()
+ : id_(0), itemsize_(0), ctor_(nullptr), copy_(nullptr), dtor_(nullptr) {}
/**
* Copy constructor.
*/
TypeMeta(const TypeMeta& src)
- : id_(src.id_), itemsize_(src.itemsize_),
- ctor_(src.ctor_), copy_(src.copy_), dtor_(src.dtor_) {}
+ : id_(src.id_),
+ itemsize_(src.itemsize_),
+ ctor_(src.ctor_),
+ copy_(src.copy_),
+ dtor_(src.dtor_) {}
/**
* Assignment operator.
*/
TypeMeta& operator=(const TypeMeta& src) {
- if (this == &src) return *this;
+ if (this == &src)
+ return *this;
id_ = src.id_;
itemsize_ = src.itemsize_;
ctor_ = src.ctor_;
@@ -98,31 +97,45 @@
private:
// TypeMeta can only be created by Make, making sure that we do not
// create incorrectly mixed up TypeMeta objects.
- TypeMeta(CaffeTypeId i, size_t s, PlacementNew ctor, TypedCopy copy,
- TypedDestructor dtor)
+ TypeMeta(
+ CaffeTypeId i,
+ size_t s,
+ PlacementNew ctor,
+ TypedCopy copy,
+ TypedDestructor dtor)
: id_(i), itemsize_(s), ctor_(ctor), copy_(copy), dtor_(dtor) {}
public:
/**
* Returns the type id.
*/
- inline const CaffeTypeId& id() const { return id_; }
+ inline const CaffeTypeId& id() const {
+ return id_;
+ }
/**
* Returns the size of the item.
*/
- inline const size_t& itemsize() const { return itemsize_; }
+ inline const size_t& itemsize() const {
+ return itemsize_;
+ }
/**
* Returns the placement new function pointer for individual items.
*/
- inline PlacementNew ctor() const { return ctor_; }
+ inline PlacementNew ctor() const {
+ return ctor_;
+ }
/**
* Returns the typed copy function pointer for individual iterms.
*/
- inline TypedCopy copy() const { return copy_; }
+ inline TypedCopy copy() const {
+ return copy_;
+ }
/**
* Returns the destructor function pointer for individual items.
*/
- inline TypedDestructor dtor() const { return dtor_; }
+ inline TypedDestructor dtor() const {
+ return dtor_;
+ }
/**
* Returns a printable name for the type.
*/
@@ -131,11 +144,17 @@
assert(it != gTypeNames().end());
return it->second.c_str();
}
- inline bool operator==(const TypeMeta& m) const { return (id_ == m.id_); }
- inline bool operator!=(const TypeMeta& m) const { return (id_ != m.id_); }
+ inline bool operator==(const TypeMeta& m) const {
+ return (id_ == m.id_);
+ }
+ inline bool operator!=(const TypeMeta& m) const {
+ return (id_ != m.id_);
+ }
template <typename T>
- inline bool Match() const { return (id_ == Id<T>()); }
+ inline bool Match() const {
+ return (id_ == Id<T>());
+ }
// Below are static functions that can be called by passing a specific type.
@@ -147,22 +166,29 @@
* is generated during run-time. Do NOT serialize the id for storage.
*/
template <typename T>
- static CaffeTypeId Id() {
- static TypeNameRegisterer<T> registerer;
- return TypeNameRegisterer<T>::Id();
- }
+ static CaffeTypeId Id();
+
/**
* Returns the item size of the type. This is equivalent to sizeof(T).
*/
template <typename T>
- static size_t ItemSize() { return sizeof(T); }
+ static size_t ItemSize() {
+ return sizeof(T);
+ }
/**
* Returns the printable name of the type.
+ *
+ * Works for all types, not only the ones registered with CAFFE_KNOWN_TYPE
*/
template <typename T>
static const char* Name() {
- return gTypeNames()[Id<T>()].c_str();
+#ifdef __GXX_RTTI
+ static string name = Demangle(typeid(T).name());
+ return name.c_str();
+#else // __GXX_RTTI
+ return "(RTTI disabled, cannot show name)";
+#endif // __GXX_RTTI
}
/**
@@ -172,11 +198,10 @@
static void _Ctor(void* ptr, size_t n) {
T* typed_ptr = static_cast<T*>(ptr);
for (int i = 0; i < n; ++i) {
- new(typed_ptr + i) T;
+ new (typed_ptr + i) T;
}
}
-
/**
* Typed copy function for classes.
*/
@@ -219,20 +244,20 @@
return TypeMeta(Id<T>(), ItemSize<T>(), nullptr, nullptr, nullptr);
}
- template <typename T,
- typename std::enable_if<
- !std::is_fundamental<T>::value &&
- std::is_copy_assignable<T>::value>::type* = nullptr>
+ template <
+ typename T,
+ typename std::enable_if<
+ !std::is_fundamental<T>::value &&
+ std::is_copy_assignable<T>::value>::type* = nullptr>
static TypeMeta Make() {
- return TypeMeta(
- Id<T>(), ItemSize<T>(), _Ctor<T>, _Copy<T>, _Dtor<T>);
+ return TypeMeta(Id<T>(), ItemSize<T>(), _Ctor<T>, _Copy<T>, _Dtor<T>);
}
template <typename T>
static TypeMeta Make(
typename std::enable_if<
- !std::is_fundamental<T>::value && !std::is_copy_assignable<T>::value
- >::type* = 0) {
+ !std::is_fundamental<T>::value &&
+ !std::is_copy_assignable<T>::value>::type* = 0) {
return TypeMeta(
Id<T>(), ItemSize<T>(), _Ctor<T>, _CopyNotAllowed<T>, _Dtor<T>);
}
@@ -245,6 +270,28 @@
TypedDestructor dtor_;
};
-} // namespace caffe2
+/**
+ * Register unique id for a type so it can be used in TypeMeta context, e.g. be
+ * used as a type for Blob or for Tensor elements.
+ *
+ * CAFFE_KNOWN_TYPE does explicit instantiation of TypeMeta::Id<T> template
+ * function and thus needs to be put in a single translation unit (.cpp file)
+ * for a given type T. Other translation units that use type T as a type of the
+ * caffe2::Blob or element type of caffe2::Tensor need to depend on the
+ * translation unit that contains CAFFE_KNOWN_TYPE declaration via regular
+ * linkage dependencies.
+ *
+ * NOTE: the macro needs to be invoked in ::caffe2 namespace
+ */
+#define CAFFE_KNOWN_TYPE(T) \
+ template <> \
+ CaffeTypeId TypeMeta::Id<T>() { \
+ static bool type_id_bit[1]; \
+ static TypeNameRegisterer<T> registerer( \
+ reinterpret_cast<CaffeTypeId>(type_id_bit)); \
+ return reinterpret_cast<CaffeTypeId>(type_id_bit); \
+ }
-#endif // CAFFE2_CORE_TYPEID_H_
+} // namespace caffe2
+
+#endif // CAFFE2_CORE_TYPEID_H_
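
Following the comment above, making a new type usable as a Blob or Tensor element type now looks like the sketch below; MyPayload is a made-up type, and the macro must appear in exactly one .cc file, inside the ::caffe2 namespace (the typeid_test.cc and types.cc changes below follow the same pattern):

    // my_payload.cc (hypothetical translation unit)
    #include "caffe2/core/typeid.h"

    struct MyPayload {
      int id;
    };

    namespace caffe2 {
    CAFFE_KNOWN_TYPE(MyPayload);
    } // namespace caffe2
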
diff --git a/caffe2/core/typeid_test.cc b/caffe2/core/typeid_test.cc
index 0d18964..171bca0 100644
--- a/caffe2/core/typeid_test.cc
+++ b/caffe2/core/typeid_test.cc
@@ -7,6 +7,12 @@
class TypeMetaTestFoo {};
class TypeMetaTestBar {};
+}
+
+CAFFE_KNOWN_TYPE(TypeMetaTestFoo);
+CAFFE_KNOWN_TYPE(TypeMetaTestBar);
+
+namespace {
TEST(TypeMetaTest, TypeMetaStatic) {
EXPECT_EQ(TypeMeta::ItemSize<int>(), sizeof(int));
@@ -63,10 +69,10 @@
EXPECT_EQ(float_meta.itemsize(), TypeMeta::ItemSize<float>());
EXPECT_EQ(foo_meta.itemsize(), TypeMeta::ItemSize<TypeMetaTestFoo>());
EXPECT_EQ(bar_meta.itemsize(), TypeMeta::ItemSize<TypeMetaTestBar>());
- EXPECT_EQ(int_meta.name(), TypeMeta::Name<int>());
- EXPECT_EQ(float_meta.name(), TypeMeta::Name<float>());
- EXPECT_EQ(foo_meta.name(), TypeMeta::Name<TypeMetaTestFoo>());
- EXPECT_EQ(bar_meta.name(), TypeMeta::Name<TypeMetaTestBar>());
+ EXPECT_STREQ(int_meta.name(), TypeMeta::Name<int>());
+ EXPECT_STREQ(float_meta.name(), TypeMeta::Name<float>());
+ EXPECT_STREQ(foo_meta.name(), TypeMeta::Name<TypeMetaTestFoo>());
+ EXPECT_STREQ(bar_meta.name(), TypeMeta::Name<TypeMetaTestBar>());
}
@@ -85,6 +91,12 @@
ClassNoAssignment& operator=(const ClassNoAssignment& src) = delete;
int x;
};
+}
+
+CAFFE_KNOWN_TYPE(ClassAllowAssignment);
+CAFFE_KNOWN_TYPE(ClassNoAssignment);
+
+namespace {
TEST(TypeMetaTest, CtorDtorAndCopy) {
TypeMeta fundamental_meta = TypeMeta::Make<int>();
diff --git a/caffe2/core/types.cc b/caffe2/core/types.cc
index d71eb41..b582872 100644
--- a/caffe2/core/types.cc
+++ b/caffe2/core/types.cc
@@ -1,8 +1,29 @@
#include "caffe2/core/types.h"
#include "caffe2/core/typeid.h"
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+
namespace caffe2 {
+CAFFE_KNOWN_TYPE(float);
+CAFFE_KNOWN_TYPE(int);
+CAFFE_KNOWN_TYPE(std::string);
+CAFFE_KNOWN_TYPE(bool);
+CAFFE_KNOWN_TYPE(uint8_t);
+CAFFE_KNOWN_TYPE(int8_t);
+CAFFE_KNOWN_TYPE(uint16_t);
+CAFFE_KNOWN_TYPE(int16_t);
+CAFFE_KNOWN_TYPE(int64_t);
+CAFFE_KNOWN_TYPE(float16);
+CAFFE_KNOWN_TYPE(double);
+CAFFE_KNOWN_TYPE(char);
+CAFFE_KNOWN_TYPE(std::unique_ptr<std::mutex>);
+CAFFE_KNOWN_TYPE(std::unique_ptr<std::atomic<bool>>);
+CAFFE_KNOWN_TYPE(std::vector<int64_t>);
+
TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta) {
static_assert(sizeof(int) == 4,
"int in this compiler does not equal to 4 bytes.");
diff --git a/caffe2/core/workspace.cc b/caffe2/core/workspace.cc
index c16ec59..9cd006d 100644
--- a/caffe2/core/workspace.cc
+++ b/caffe2/core/workspace.cc
@@ -2,6 +2,7 @@
#include <algorithm>
#include <ctime>
+#include <mutex>
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
@@ -290,14 +291,24 @@
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return !got_failure && externalShouldContinue(iter);
};
+ std::mutex exception_mutex;
+ std::exception_ptr first_exception;
auto worker = [&]() {
while (true) {
int substep_id = next_substep++;
if (got_failure || (substep_id >= step.substep().size())) {
break;
}
- if (!ExecuteStepRecursive(
- step.substep().Get(substep_id), substepShouldContinue)) {
+ try {
+ if (!ExecuteStepRecursive(
+ step.substep().Get(substep_id), substepShouldContinue)) {
+ got_failure = true;
+ }
+ } catch (const std::exception& ex) {
+ std::lock_guard<std::mutex> guard(exception_mutex);
+ if (!first_exception) {
+ first_exception = std::current_exception();
+ }
got_failure = true;
}
}
@@ -311,6 +322,10 @@
thread.join();
}
if (got_failure) {
+ LOG(ERROR) << "One of the workers died with an unhandled exception";
+ if (first_exception != nullptr) {
+ std::rethrow_exception(first_exception);
+ }
return false;
}
// concurrent substeps should be careful about setting should_stop_blob
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
index 31ae3e6..ad43296 100644
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@@ -18,12 +18,18 @@
class NetBase;
struct StopOnSignal {
- StopOnSignal(): handler_(SignalHandler::Action::STOP,
- SignalHandler::Action::STOP) {}
+ StopOnSignal()
+ : handler_(std::make_shared<SignalHandler>(
+ SignalHandler::Action::STOP,
+ SignalHandler::Action::STOP)) {}
+
+ StopOnSignal(const StopOnSignal& other) : handler_(other.handler_) {}
+
bool operator()(int iter) {
- return handler_.CheckForSignals() != SignalHandler::Action::STOP;
+ return handler_->CheckForSignals() != SignalHandler::Action::STOP;
}
- SignalHandler handler_;
+
+ std::shared_ptr<SignalHandler> handler_;
};
/**
diff --git a/caffe2/core/workspace_test.cc b/caffe2/core/workspace_test.cc
index 91fbd18..5ac3f44 100644
--- a/caffe2/core/workspace_test.cc
+++ b/caffe2/core/workspace_test.cc
@@ -8,6 +8,8 @@
class WorkspaceTestFoo {};
+CAFFE_KNOWN_TYPE(WorkspaceTestFoo);
+
TEST(WorkspaceTest, BlobAccess) {
Workspace ws;
@@ -71,5 +73,3 @@
}
} // namespace caffe2
-
-
diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h
index 3824a24..7ab419f 100644
--- a/caffe2/cuda_rtc/common_rtc.h
+++ b/caffe2/cuda_rtc/common_rtc.h
@@ -73,8 +73,8 @@
unsigned int bx, unsigned int by, unsigned int bz,
unsigned int shared_mem, cudaStream_t stream,
Args... args) {
- CHECK(module_loaded_)
- << "Cannot call Launch before a module is loaded.";
+ CAFFE_ENFORCE(
+ module_loaded_, "Cannot call Launch before a module is loaded.");
void * args_voidp[] = {&args...};
CUDA_DRIVERAPI_CHECK(cuLaunchKernel(
kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream,
@@ -85,8 +85,8 @@
unsigned int bx, unsigned int by, unsigned int bz,
unsigned int shared_mem, cudaStream_t stream,
void** extra) {
- CHECK(module_loaded_)
- << "Cannot call Launch before a module is loaded.";
+ CAFFE_ENFORCE(
+ module_loaded_, "Cannot call Launch before a module is loaded.");
CUDA_DRIVERAPI_CHECK(cuLaunchKernel(
kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream,
nullptr, extra));
diff --git a/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc b/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
index 71d36ab..b766ff4 100644
--- a/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
+++ b/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
@@ -75,7 +75,7 @@
: Operator<CUDAContext>(operator_def, ws) {
const string src = OperatorBase::GetSingleArgument<string>(
"rtc_src", "");
- CHECK(src.size()) << "Op should have a non-zero source code size.";
+ CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
func_.Compile(InputSize(), OutputSize(), src);
}
~ElementwiseRTCOp() {}
@@ -85,8 +85,9 @@
"The argbuffer relies on the assumption that void* and "
"size_t have the same size.");
size_t argBuffer[InputSize() + OutputSize() + 1];
- CHECK(Input(0).size() < std::numeric_limits<int>::max())
- << "The kernel function currently only supports int index.";
+ CAFFE_ENFORCE(
+ Input(0).size() < std::numeric_limits<int>::max(),
+ "The kernel function currently only supports int index.");
argBuffer[0] = Input(0).size();
void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
for (int i = 0; i < InputSize(); ++i) {
diff --git a/caffe2/db/create_db_op.h b/caffe2/db/create_db_op.h
index bc345be..2650ad2 100644
--- a/caffe2/db/create_db_op.h
+++ b/caffe2/db/create_db_op.h
@@ -15,18 +15,25 @@
db_type_(OperatorBase::template GetSingleArgument<string>(
"db_type",
"leveldb")),
- db_name_(OperatorBase::template GetSingleArgument<string>("db", "")) {
+ db_name_(OperatorBase::template GetSingleArgument<string>("db", "")),
+ num_shards_(
+ OperatorBase::template GetSingleArgument<int>("num_shards", 1)),
+ shard_id_(
+ OperatorBase::template GetSingleArgument<int>("shard_id", 0)) {
CHECK_GT(db_name_.size(), 0) << "Must specify a db name.";
}
bool RunOnDevice() final {
- OperatorBase::Output<db::DBReader>(0)->Open(db_type_, db_name_);
+ OperatorBase::Output<db::DBReader>(0)->Open(
+ db_type_, db_name_, num_shards_, shard_id_);
return true;
}
private:
string db_type_;
string db_name_;
+ uint32_t num_shards_;
+ uint32_t shard_id_;
DISABLE_COPY_AND_ASSIGN(CreateDBOp);
};
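
With the two new arguments, a CreateDB op can open just one shard of a database: the resulting DBReader visits every num_shards-th key starting at shard_id, as DBReaderShardedTest below demonstrates. A sketch; the db path and output blob name are placeholders:

    OperatorDef def = CreateOperatorDef(
        "CreateDB",
        "",
        vector<string>{},
        vector<string>{"train_db_reader"},
        vector<Argument>{MakeArgument<string>("db_type", "leveldb"),
                         MakeArgument<string>("db", "/path/to/train_db"),
                         MakeArgument<int>("num_shards", 8),
                         MakeArgument<int>("shard_id", 2)});
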
diff --git a/caffe2/db/db_test.cc b/caffe2/db/db_test.cc
index e0e9820..c769c65 100644
--- a/caffe2/db/db_test.cc
+++ b/caffe2/db/db_test.cc
@@ -151,5 +151,50 @@
EXPECT_EQ(keys_set.size(), kMaxItems);
}
+TEST(DBReaderShardedTest, Reader) {
+ std::string name = std::tmpnam(nullptr);
+ CreateAndFill("leveldb", name);
+
+ std::unique_ptr<DBReader> reader0(new DBReader("leveldb", name, 3, 0));
+ string key;
+ string value;
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "00");
+ EXPECT_EQ(value, "00");
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "03");
+ EXPECT_EQ(value, "03");
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "06");
+ EXPECT_EQ(value, "06");
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "09");
+ EXPECT_EQ(value, "09");
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "00");
+ EXPECT_EQ(value, "00");
+ reader0->Read(&key, &value);
+ EXPECT_EQ(key, "03");
+ EXPECT_EQ(value, "03");
+
+ CreateAndFill("leveldb", name + "1");
+ std::unique_ptr<DBReader> reader1(new DBReader("leveldb", name + "1", 3, 1));
+ reader1->Read(&key, &value);
+ EXPECT_EQ(key, "01");
+ EXPECT_EQ(value, "01");
+ reader1->Read(&key, &value);
+ EXPECT_EQ(key, "04");
+ EXPECT_EQ(value, "04");
+
+ CreateAndFill("leveldb", name + "2");
+ std::unique_ptr<DBReader> reader2(new DBReader("leveldb", name + "2", 3, 2));
+ reader2->Read(&key, &value);
+ EXPECT_EQ(key, "02");
+ EXPECT_EQ(value, "02");
+ reader2->Read(&key, &value);
+ EXPECT_EQ(key, "05");
+ EXPECT_EQ(value, "05");
+}
+
} // namespace db
} // namespace caffe2
diff --git a/caffe2/db/rocksdb.cc b/caffe2/db/rocksdb.cc
index af6539c..61753e0 100644
--- a/caffe2/db/rocksdb.cc
+++ b/caffe2/db/rocksdb.cc
@@ -42,8 +42,8 @@
void Commit() override {
rocksdb::Status status = db_->Write(rocksdb::WriteOptions(), batch_.get());
batch_.reset(new rocksdb::WriteBatch());
- CHECK(status.ok()) << "Failed to write batch to rocksdb "
- << std::endl << status.ToString();
+ CAFFE_ENFORCE(
+ status.ok(), "Failed to write batch to rocksdb: " + status.ToString());
}
private:
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 5be044e..dbc09b3 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -110,7 +110,7 @@
if (use_caffe_datum_) {
// The input is a caffe datum format.
caffe::Datum datum;
- CHECK(datum.ParseFromString(value));
+ CAFFE_ENFORCE(datum.ParseFromString(value));
*label = datum.label();
if (datum.encoded()) {
// encoded image in datum.
@@ -123,8 +123,8 @@
*img = cv::Mat(datum.height(), datum.width(),
color_ ? CV_8UC3 : CV_8UC1);
// Note(Yangqing): I believe that the mat should be created continuous.
- CHECK(img->isContinuous());
- CHECK((color_ && datum.channels() == 3) || datum.channels() == 1);
+ CAFFE_ENFORCE(img->isContinuous());
+ CAFFE_ENFORCE((color_ && datum.channels() == 3) || datum.channels() == 1);
if (datum.channels() == 1) {
memcpy(img->ptr<uchar>(0), datum.data().data(), datum.data().size());
} else {
@@ -146,7 +146,7 @@
} else {
// The input is a caffe2 format.
TensorProtos protos;
- CHECK(protos.ParseFromString(value));
+ CAFFE_ENFORCE(protos.ParseFromString(value));
const TensorProto& image_proto = protos.protos(0);
const TensorProto& label_proto = protos.protos(1);
if (image_proto.data_type() == TensorProto::STRING) {
@@ -166,7 +166,7 @@
<< "Image height must be bigger than crop.";
CHECK_GE(image_proto.dims(1), crop_)
<< "Image width must be bigger than crop.";
- CHECK(!color_ || image_proto.dims(2) == 3);
+ CAFFE_ENFORCE(!color_ || image_proto.dims(2) == 3);
*img = cv::Mat(
image_proto.dims(0), image_proto.dims(1), color_ ? CV_8UC3 : CV_8UC1);
memcpy(img->ptr<uchar>(0), image_proto.byte_data().data(),
@@ -214,7 +214,7 @@
cv::Mat scaled_img;
// process data
reader_->Read(&key, &value);
- CHECK(GetImageAndLabelFromDBValue(value, &img, &label));
+ CAFFE_ENFORCE(GetImageAndLabelFromDBValue(value, &img, &label));
// deal with scaling.
int scaled_width, scaled_height;
if (warp_) {
diff --git a/caffe2/mpi/mpi_common.cc b/caffe2/mpi/mpi_common.cc
index 4ee1aec..65f749a 100644
--- a/caffe2/mpi/mpi_common.cc
+++ b/caffe2/mpi/mpi_common.cc
@@ -2,10 +2,13 @@
#include <thread>
+#include "caffe2/core/typeid.h"
#include "caffe2/utils/proto_utils.h"
namespace caffe2 {
+CAFFE_KNOWN_TYPE(MPICommonWorldWrapper);
+
static std::mutex gCaffe2MPIMutex;
std::mutex& MPIMutex() {
diff --git a/caffe2/mpi/mpi_ops_gpu.cc b/caffe2/mpi/mpi_ops_gpu.cc
index 53080fd..6383bc9 100644
--- a/caffe2/mpi/mpi_ops_gpu.cc
+++ b/caffe2/mpi/mpi_ops_gpu.cc
@@ -9,41 +9,38 @@
// version supports CUDA aware MPI functions or not.
#if OPEN_MPI
- #define CAFFE2_OMPI_VERSION \
- OMPI_MAJOR_VERSION * 10000 + OMPI_MINOR_VERSION * 100 + \
- OMPI_RELEASE_VERSION
- #if CAFFE2_OMPI_VERSION >= 20000
- // openmpi 2.x now supports compile time check whether cuda support is
- // built with openmpi.
- #include "mpi-ext.h" /* Needed for CUDA-aware check */
- #if MPIX_CUDA_AWARE_SUPPORT
+#define CAFFE2_OMPI_VERSION \
+ OMPI_MAJOR_VERSION * 10000 + OMPI_MINOR_VERSION * 100 + OMPI_RELEASE_VERSION
+#if CAFFE2_OMPI_VERSION >= 20000
+// OpenMPI 2.x supports a compile-time check for whether CUDA support is built in.
+#include "mpi-ext.h" /* Needed for CUDA-aware check */
+#if MPIX_CUDA_AWARE_SUPPORT
#define CAFFE2_HAS_CUDA_MPI_BASICS 1
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
#endif // MPIX_CUDA_AWARE_SUPPORT
#else // CAFFE2_OMPI_VERSION >= 2000
-// In the case of openmpi 1.x, we don't have compile-time flags to figure
-// out if cuda is built; as a result, we will assume that the user has built
-// openmpi with cuda.
-// CUDA-aware MPIBroadcast is introduced after openmpi 1.7.
+// In the case of OpenMPI 1.x, we don't have compile-time flags to
+// figure out if CUDA is supported; as a result, we will assume that
+// the user has built OpenMPI with CUDA support.
+// CUDA-aware MPIBroadcast is introduced after OpenMPI 1.7.
#if CAFFE2_OMPI_VERSION >= 10700
#define CAFFE2_HAS_CUDA_MPI_BASICS 1
#else // CAFFE2_OMPI_VERSION >= 10700
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#endif // CAFFE2_OMPI_VERSION >= 10700
-
-// CUDA-aware MPIAllreduce is introduced after openmpi 1.8.5.
+// CUDA-aware MPIAllreduce is introduced after OpenMPI 1.8.5.
#if CAFFE2_OMPI_VERSION >= 10805
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
#else // CAFFE2_OMPI_VERSION >= 10805
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
#endif // CAFFE2_OMPI_VERSION >= 10805
#endif // CAFFE2_OMPI_VERSION >= 2000
-#else // !OPEN_MPI
- // We have not really tested against other MPI environments, so let's go for a
- // safe path and basically say we don't have cuda-aware functions.
+#else // !OPEN_MPI
+// We have not really tested against other MPI environments, so let's go for a
+// safe path and basically say we don't have cuda-aware functions.
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
-#endif // OPEN_MPI
+#endif // OPEN_MPI
// We allow a macro to force using fallback functions.
#ifdef CAFFE2_FORCE_FALLBACK_CUDA_MPI
@@ -51,7 +48,7 @@
#undef CAFFE2_HAS_CUDA_MPI_ALLREDUCE
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
-#endif // CAFFE2_FORCE_FALLBACK_CUDA_MPI
+#endif // CAFFE2_FORCE_FALLBACK_CUDA_MPI
namespace {
diff --git a/caffe2/mpi/mpi_python.cc b/caffe2/mpi/mpi_python.cc
new file mode 100644
index 0000000..b82ef32
--- /dev/null
+++ b/caffe2/mpi/mpi_python.cc
@@ -0,0 +1,36 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "caffe2/caffe2/mpi/mpi_common.h"
+
+namespace caffe2 {
+
+namespace py = pybind11;
+
+PYBIND11_PLUGIN(mpi) {
+ py::module m("mpi", "MPI helper functions");
+ m.def(
+ "SetupPeers",
+ &MPISetupPeers,
+ py::arg("replicas"),
+ py::arg("role"),
+ py::arg("job_path"));
+ m.def("CommSize", [] {
+ auto comm = GlobalMPIComm();
+ return MPICommSize(comm);
+ });
+ m.def("CommRank", [] {
+ auto comm = GlobalMPIComm();
+ return MPICommRank(comm);
+ });
+ m.def("Finalize", [] {
+ // NOTE(pietern): Doesn't seem to work when calling it
+ // from Python. It ends up calling pthread_join on a
+ // thread that doesn't exit. For now, running mpirun
+ // with `-quiet` and skipping the finalize call.
+ MPI_Finalize();
+ });
+ return m.ptr();
+}
+
+} // namespace caffe2
diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc
new file mode 100644
index 0000000..5a44691
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op.cc
@@ -0,0 +1,124 @@
+#include "caffe2/operators/batch_matmul_op.h"
+
+namespace caffe2 {
+namespace {
+
+REGISTER_CPU_OPERATOR(BatchMatMul, BatchMatMulOp<float, CPUContext>);
+
+OPERATOR_SCHEMA(BatchMatMul)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Batch Matrix multiplication Yi = Ai * Bi, where A has size (C x M x K), B has
+size (C x K x N) where C is the batch size and i ranges from 0 to C-1.
+)DOC")
+ .Input(0, "A", "3D matrix of size (C x M x K)")
+ .Input(1, "B", "3D matrix of size (C x K x N)")
+ .Output(0, "Y", "3D matrix of size (C x M x N)")
+ .Arg("trans_a", "Pass 1 to transpose A before multiplication")
+ .Arg("trans_b", "Pass 1 to transpose B before multiplication");
+
+class GetBatchMatMulGradient : public GradientMakerBase {
+ using GradientMakerBase::GradientMakerBase;
+ vector<OperatorDef> GetGradientDefs() override {
+ CHECK_EQ(def_.input_size(), 2);
+
+ bool trans_a = 0;
+ bool trans_b = 0;
+
+ if (HasArgument(Def(), "trans_a")) {
+ trans_a = GetArgument(Def(), "trans_a").i();
+ }
+ if (HasArgument(Def(), "trans_b")) {
+ trans_b = GetArgument(Def(), "trans_b").i();
+ }
+
+ const auto no_trans_arg = vector<Argument>();
+ const auto trans_a_arg = vector<Argument>{
+ MakeArgument<int>("trans_a", 1)};
+ const auto trans_b_arg = vector<Argument>{
+ MakeArgument<int>("trans_b", 1)};
+ const auto trans_both_arg = vector<Argument>{
+ MakeArgument<int>("trans_a", 1),
+ MakeArgument<int>("trans_b", 1)};
+
+ if (trans_a) {
+ if (trans_b) {
+ // A'B':
+ // dA = B'G', dB = G'A'
+ return vector<OperatorDef>{
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{I(1), GO(0)},
+ vector<string>{GI(0)},
+ trans_both_arg),
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{GO(0), I(0)},
+ vector<string>{GI(1)},
+ trans_both_arg)};
+ } else {
+ // A'B:
+ // dA = BG', dB = AG
+ return vector<OperatorDef>{
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{I(1), GO(0)},
+ vector<string>{GI(0)},
+ trans_b_arg),
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{I(0), GO(0)},
+ vector<string>{GI(1)},
+ no_trans_arg)};
+ }
+ } else {
+ if (trans_b) {
+ // AB':
+ // dA = GB, dB = G'A
+ return vector<OperatorDef>{
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{GO(0), I(1)},
+ vector<string>{GI(0)},
+ no_trans_arg),
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{GO(0), I(0)},
+ vector<string>{GI(1)},
+ trans_a_arg)};
+ } else {
+ // AB:
+ // dA = GB', dB = A'G
+ return vector<OperatorDef>{
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{GO(0), I(1)},
+ vector<string>{GI(0)},
+ trans_b_arg),
+ CreateOperatorDef(
+ "BatchMatMul",
+ "",
+ vector<string>{I(0), GO(0)},
+ vector<string>{GI(1)},
+ trans_a_arg)};
+ }
+ }
+ }
+
+ bool CopyArguments() const override {
+ return false;
+ }
+};
+
+REGISTER_GRADIENT(BatchMatMul, GetBatchMatMulGradient);
+
+} // namespace
+} // namespace caffe2
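
A sketch of invoking the new operator with a transposed left-hand side (blob names are placeholders): given A of shape (C x K x M) and B of shape (C x K x N), setting trans_a computes Yi = Ai' * Bi of shape (C x M x N):

    OperatorDef def = CreateOperatorDef(
        "BatchMatMul",
        "",
        vector<string>{"A", "B"},
        vector<string>{"Y"},
        vector<Argument>{MakeArgument<int>("trans_a", 1)});
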
diff --git a/caffe2/operators/batch_matmul_op.h b/caffe2/operators/batch_matmul_op.h
new file mode 100644
index 0000000..33e57c8
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op.h
@@ -0,0 +1,90 @@
+#ifndef CAFFE2_OPERATORS_MATMUL_OP_H_
+#define CAFFE2_OPERATORS_MATMUL_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+template <typename T, class Context, class Engine = DefaultEngine>
+class BatchMatMulOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ BatchMatMulOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws),
+ trans_a_(OperatorBase::GetSingleArgument<int>("trans_a", 0)),
+ trans_b_(OperatorBase::GetSingleArgument<int>("trans_b", 0)) {}
+ ~BatchMatMulOp() {}
+
+ bool RunOnDevice() override {
+ const auto& A = Input(0);
+ const auto& B = Input(1);
+ auto* Y = Output(0);
+
+ CAFFE_ENFORCE_EQ(A.ndim(), 3);
+ CAFFE_ENFORCE_EQ(B.ndim(), 3);
+ CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0));
+
+ int a_dim0, a_dim1, b_dim0, b_dim1;
+
+ if (trans_a_) {
+ a_dim0 = A.dim32(2);
+ a_dim1 = A.dim32(1);
+ } else {
+ a_dim0 = A.dim32(1);
+ a_dim1 = A.dim32(2);
+ }
+
+ if (trans_b_) {
+ b_dim0 = B.dim32(2);
+ b_dim1 = B.dim32(1);
+ } else {
+ b_dim0 = B.dim32(1);
+ b_dim1 = B.dim32(2);
+ }
+
+ // Error checking
+ CAFFE_ENFORCE(
+ a_dim1 == b_dim0,
+ "Dimension mismatch: ",
+ trans_a_ ? "trans(A): " : "A: ",
+ a_dim0,
+ " ",
+ a_dim1,
+ trans_b_ ? ", trans(B): " : ", B: ",
+ b_dim0,
+ " ",
+ b_dim1);
+
+ Y->Resize(A.dim(0), a_dim0, b_dim1);
+
+ // Y = A * B
+ auto a_offset = A.size() / A.dim(0);
+ auto b_offset = B.size() / B.dim(0);
+ auto y_offset = a_dim0 * b_dim1;
+ for (int i = 0; i < A.dim32(0); ++i) {
+ math::Gemm<T, Context, Engine>(
+ trans_a_ ? CblasTrans : CblasNoTrans,
+ trans_b_ ? CblasTrans : CblasNoTrans,
+ a_dim0,
+ b_dim1,
+ a_dim1,
+ 1,
+ A.template data<T>() + a_offset * i,
+ B.template data<T>() + b_offset * i,
+ 0,
+ Y->template mutable_data<T>() + y_offset * i,
+ &context_);
+ }
+ return true;
+ }
+
+ protected:
+ bool trans_a_;
+ bool trans_b_;
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_MATMUL_OP_H_
diff --git a/caffe2/operators/batch_matmul_op_gpu.cc b/caffe2/operators/batch_matmul_op_gpu.cc
new file mode 100644
index 0000000..bfd77ae
--- /dev/null
+++ b/caffe2/operators/batch_matmul_op_gpu.cc
@@ -0,0 +1,10 @@
+#include "caffe2/operators/batch_matmul_op.h"
+
+#include "caffe2/core/context_gpu.h"
+
+namespace caffe2 {
+namespace {
+
+REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<float, CUDAContext>);
+}
+}
diff --git a/caffe2/operators/communicator_op.cc b/caffe2/operators/communicator_op.cc
index 9679dcc..b2ea58b 100644
--- a/caffe2/operators/communicator_op.cc
+++ b/caffe2/operators/communicator_op.cc
@@ -51,7 +51,7 @@
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
-Does an allgather operation among the nodes. Currently only Sum is supported.
+Does an allgather operation among the nodes.
)DOC")
.Input(0, "comm_world", "The common world.")
.Input(1, "X", "A tensor to be allgathered.")
diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h
index 0ed412b..41845d6 100644
--- a/caffe2/operators/concat_split_op.h
+++ b/caffe2/operators/concat_split_op.h
@@ -30,10 +30,10 @@
SplitOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
split_(OperatorBase::GetRepeatedArgument<int>("split")) {
- CHECK(OperatorBase::HasArgument("axis") ^
- OperatorBase::HasArgument("order"))
- << "You should either specify the dim to split, or the order "
- "in the case of 4-D images.";
+ CAFFE_ENFORCE(
+ OperatorBase::HasArgument("axis") ^ OperatorBase::HasArgument("order"),
+ "You should either specify the dim to split, or the order "
+ "in the case of 4-D images.");
if (OperatorBase::HasArgument("axis")) {
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
} else {
@@ -58,10 +58,10 @@
USE_OPERATOR_CONTEXT_FUNCTIONS;
ConcatOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
- CHECK(OperatorBase::HasArgument("axis") ^
- OperatorBase::HasArgument("order"))
- << "You should either specify the dim to split, or the order "
- "in the case of 4-D images.";
+ CAFFE_ENFORCE(
+ OperatorBase::HasArgument("axis") ^ OperatorBase::HasArgument("order"),
+ "You should either specify the dim to split, or the order "
+ "in the case of 4-D images.");
if (OperatorBase::HasArgument("axis")) {
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
} else {
diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc
index 19667ab..70892ac 100644
--- a/caffe2/operators/conv_op_cudnn.cc
+++ b/caffe2/operators/conv_op_cudnn.cc
@@ -45,7 +45,7 @@
deterministic_(
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
- CHECK(!deterministic_ || !exhaustive_search_);
+ CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h
index c6e1341..d25ece8 100644
--- a/caffe2/operators/conv_op_impl.h
+++ b/caffe2/operators/conv_op_impl.h
@@ -3,24 +3,33 @@
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#include "caffe2/core/context.h"
+#include "caffe2/core/flags.h"
+#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
+#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
-#include "caffe2/core/logging.h"
+
+CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
namespace caffe2 {
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
- auto& X = Input(INPUT);
+ const Tensor<Context>& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
- auto* Y = Output(0);
+ Tensor<Context>* Y = Output(0);
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
- CAFFE_ENFORCE(C == filter.dim32(1));
+ CAFFE_ENFORCE(
+ C == filter.dim32(1),
+ "Convolution op: # of input channels does not match: # of input channels ",
+ C,
+ " is not equal to kernel channels:",
+ filter.dim32(1));
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(bias.ndim() == 1);
@@ -36,51 +45,77 @@
const int output_image_size = Y->dim32(2) * Y->dim32(3);
// The col buffer is stored in CHW order as well - kernel_dim, and the height
// and width.
- col_buffer_.Resize(vector<TIndex>{
- C, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
+ const T* Xdata = X.template data<T>();
if (bias_multiplier_.size() != output_image_size) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
- output_image_size, static_cast<T>(1),
- bias_multiplier_.template mutable_data<T>(), &context_);
+ output_image_size,
+ static_cast<T>(1),
+ bias_multiplier_.template mutable_data<T>(),
+ &context_);
}
- const T* Xdata = X.template data<T>();
- T* col_buffer_data = col_buffer_.template mutable_data<T>();
T* Ydata = Y->template mutable_data<T>();
- // Im2col, followed by gemm.
- for (int image_id = 0; image_id < N; ++image_id) {
- math::Im2col<T, Context, StorageOrder::NCHW>(
- Xdata,
- C,
- H,
- W,
- kernel_h_,
- kernel_w_,
- dilation_h_,
- dilation_w_,
- pad_t_,
- pad_l_,
- pad_b_,
- pad_r_,
- stride_h_,
- stride_w_,
- col_buffer_data,
- &context_);
- // Weight term
- math::Gemm<T, Context>(
- CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
- 1, filter.template data<T>(), col_buffer_data,
- 0, Ydata,
- &context_);
- // Bias term
- math::Gemm<T, Context>(
- CblasNoTrans, CblasNoTrans, M, output_image_size, 1, 1,
- bias.template data<T>(), bias_multiplier_.template data<T>(),
- 1, Ydata,
- &context_);
- Xdata += input_offset;
- Ydata += output_offset;
+
+ auto f = [&](Tensor<Context>* col_buffer) {
+ col_buffer->Resize(
+ vector<TIndex>{C, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
+
+ T* col_buffer_data = col_buffer->template mutable_data<T>();
+ // Im2col, followed by gemm.
+ for (int image_id = 0; image_id < N; ++image_id) {
+ math::Im2col<T, Context, StorageOrder::NCHW>(
+ Xdata,
+ C,
+ H,
+ W,
+ kernel_h_,
+ kernel_w_,
+ dilation_h_,
+ dilation_w_,
+ pad_t_,
+ pad_l_,
+ pad_b_,
+ pad_r_,
+ stride_h_,
+ stride_w_,
+ col_buffer_data,
+ &context_);
+ // Weight term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ M,
+ output_image_size,
+ kernel_dim,
+ 1,
+ filter.template data<T>(),
+ col_buffer_data,
+ 0,
+ Ydata,
+ &context_);
+ // Bias term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ M,
+ output_image_size,
+ 1,
+ 1,
+ bias.template data<T>(),
+ bias_multiplier_.template data<T>(),
+ 1,
+ Ydata,
+ &context_);
+ Xdata += input_offset;
+ Ydata += output_offset;
+ }
+ };
+
+ if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+ runWithSharedBuffer<Context>(ws_, f);
+ } else {
+ f(&col_buffer_);
}
return true;
}
@@ -88,10 +123,10 @@
// The implementations.
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
- auto& X = Input(INPUT);
+ const Tensor<Context>& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
- auto* Y = Output(0);
+ Tensor<Context>* Y = Output(0);
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
@@ -147,41 +182,64 @@
output_image_size, static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(), &context_);
}
- col_buffer_.Resize(vector<TIndex>{
- Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
- T* col_buffer_data = col_buffer_.template mutable_data<T>();
- // Im2col, followed by gemm.
- for (int image_id = 0; image_id < N; ++image_id) {
- math::Im2col<T, Context, StorageOrder::NHWC>(
- Xdata,
- C,
- H,
- W,
- kernel_h_,
- kernel_w_,
- dilation_h_,
- dilation_w_,
- pad_t_,
- pad_l_,
- pad_b_,
- pad_r_,
- stride_h_,
- stride_w_,
- col_buffer_data,
- &context_);
- // Weight term
- // Wait, is this right....?
- math::Gemm<T, Context>(
- CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
- 1, col_buffer_data, filter.template data<T>(), 0, Ydata,
- &context_);
- // Bias term
- math::Gemm<T, Context>(
- CblasNoTrans, CblasNoTrans, output_image_size, M, 1, 1,
- bias_multiplier_.template data<T>(), bias.template data<T>(), 1,
- Ydata, &context_);
- Xdata += input_offset;
- Ydata += output_offset;
+ auto f = [&](Tensor<Context>* col_buffer) {
+ col_buffer->Resize(
+ vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
+ T* col_buffer_data = col_buffer->template mutable_data<T>();
+ // Im2col, followed by gemm.
+ for (int image_id = 0; image_id < N; ++image_id) {
+ math::Im2col<T, Context, StorageOrder::NHWC>(
+ Xdata,
+ C,
+ H,
+ W,
+ kernel_h_,
+ kernel_w_,
+ dilation_h_,
+ dilation_w_,
+ pad_t_,
+ pad_l_,
+ pad_b_,
+ pad_r_,
+ stride_h_,
+ stride_w_,
+ col_buffer_data,
+ &context_);
+ // Weight term
+ // Wait, is this right....?
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasTrans,
+ output_image_size,
+ M,
+ kernel_dim,
+ 1,
+ col_buffer_data,
+ filter.template data<T>(),
+ 0,
+ Ydata,
+ &context_);
+ // Bias term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ output_image_size,
+ M,
+ 1,
+ 1,
+ bias_multiplier_.template data<T>(),
+ bias.template data<T>(),
+ 1,
+ Ydata,
+ &context_);
+ Xdata += input_offset;
+ Ydata += output_offset;
+ }
+ };
+ if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+ runWithSharedBuffer<Context>(ws_, f);
+ } else {
+ f(&col_buffer_);
}
}
return true;
diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc
new file mode 100644
index 0000000..acb29ec
--- /dev/null
+++ b/caffe2/operators/conv_op_shared.cc
@@ -0,0 +1,23 @@
+#include "conv_op_shared.h"
+#include "caffe2/core/context.h"
+#include "caffe2/core/flags.h"
+#include "caffe2/core/workspace.h"
+
+CAFFE2_DEFINE_bool(
+ caffe2_force_shared_col_buffer,
+ false,
+ "Always use the shared col buffer");
+
+namespace caffe2 {
+
+template <>
+void runWithSharedBuffer(
+ Workspace* ws,
+ std::function<void(Tensor<CPUContext>* buffer)> f) {
+ static std::mutex m;
+ std::lock_guard<std::mutex> g(m);
+ auto* buffer = ws->CreateBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")
+ ->GetMutable<TensorCPU>();
+ f(buffer);
+}
+}
diff --git a/caffe2/operators/conv_op_shared.h b/caffe2/operators/conv_op_shared.h
new file mode 100644
index 0000000..939f590
--- /dev/null
+++ b/caffe2/operators/conv_op_shared.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/workspace.h"
+
+namespace caffe2 {
+
+template <typename Context>
+void runWithSharedBuffer(
+ Workspace* ws,
+ std::function<void(Tensor<Context>* buffer)> f);
+}
diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc
new file mode 100644
index 0000000..eed549f
--- /dev/null
+++ b/caffe2/operators/conv_op_shared_gpu.cc
@@ -0,0 +1,16 @@
+#include "caffe2/core/context_gpu.h"
+#include "conv_op_shared.h"
+
+namespace caffe2 {
+
+template <>
+void runWithSharedBuffer(
+ Workspace* ws,
+ std::function<void(Tensor<CUDAContext>* buffer)> f) {
+ static std::mutex m;
+ std::lock_guard<std::mutex> g(m);
+ auto* buffer = ws->CreateBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")
+ ->GetMutable<TensorCUDA>();
+ f(buffer);
+}
+}
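
The shared buffer above is opted into per operator through the new shared_buffer argument (read in conv_pool_op_base.h below), or globally through the caffe2_force_shared_col_buffer flag; participating convolutions then serialize on a single per-device workspace blob for their im2col scratch space instead of each keeping its own col_buffer_. A sketch with placeholder blob names:

    OperatorDef def = CreateOperatorDef(
        "Conv",
        "",
        vector<string>{"X", "W", "b"},
        vector<string>{"Y"},
        vector<Argument>{MakeArgument<int>("kernel", 3),
                         MakeArgument<int>("shared_buffer", 1)});
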
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index f465331..c83de71 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -56,7 +56,10 @@
"stride_w",
OperatorBase::GetSingleArgument<int>("stride", 1))),
order_(StringToStorageOrder(
- OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
+ OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
+ shared_buffer_(
+ OperatorBase::GetSingleArgument<int>("shared_buffer", 0)),
+ ws_(ws) {
// For the padding, they should either be the legacy padding strategy
// (VALID or SAME), or an explicit, non-negative value.
if (legacy_pad_ == LegacyPadding::VALID ||
@@ -196,6 +199,8 @@
int stride_h_;
int stride_w_;
StorageOrder order_;
+ bool shared_buffer_;
+ Workspace* ws_;
inline void ComputeSizeAndPad(
const int in_size,
@@ -280,20 +285,22 @@
private:
};
-#define USE_CONV_POOL_BASE_FUNCTIONS(Context) \
- USE_OPERATOR_FUNCTIONS(Context); \
- using ConvPoolOpBase<Context>::pad_t_; \
- using ConvPoolOpBase<Context>::pad_l_; \
- using ConvPoolOpBase<Context>::pad_b_; \
- using ConvPoolOpBase<Context>::pad_r_; \
- using ConvPoolOpBase<Context>::legacy_pad_; \
- using ConvPoolOpBase<Context>::kernel_h_; \
- using ConvPoolOpBase<Context>::kernel_w_; \
- using ConvPoolOpBase<Context>::dilation_h_; \
- using ConvPoolOpBase<Context>::dilation_w_; \
- using ConvPoolOpBase<Context>::stride_h_; \
- using ConvPoolOpBase<Context>::stride_w_; \
- using ConvPoolOpBase<Context>::order_
+#define USE_CONV_POOL_BASE_FUNCTIONS(Context) \
+ USE_OPERATOR_FUNCTIONS(Context); \
+ using ConvPoolOpBase<Context>::pad_t_; \
+ using ConvPoolOpBase<Context>::pad_l_; \
+ using ConvPoolOpBase<Context>::pad_b_; \
+ using ConvPoolOpBase<Context>::pad_r_; \
+ using ConvPoolOpBase<Context>::legacy_pad_; \
+ using ConvPoolOpBase<Context>::kernel_h_; \
+ using ConvPoolOpBase<Context>::kernel_w_; \
+ using ConvPoolOpBase<Context>::dilation_h_; \
+ using ConvPoolOpBase<Context>::dilation_w_; \
+ using ConvPoolOpBase<Context>::stride_h_; \
+ using ConvPoolOpBase<Context>::stride_w_; \
+ using ConvPoolOpBase<Context>::order_; \
+ using ConvPoolOpBase<Context>::shared_buffer_; \
+ using ConvPoolOpBase<Context>::ws_
} // namespace caffe2
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc
index dbcef28..a292e49 100644
--- a/caffe2/operators/conv_transpose_op_cudnn.cc
+++ b/caffe2/operators/conv_transpose_op_cudnn.cc
@@ -45,7 +45,7 @@
deterministic_(
OperatorBase::GetSingleArgument<int>("deterministic", 0)),
cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)) {
- CHECK(!deterministic_ || !exhaustive_search_);
+ CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bottom_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h
index ed5016e..64a573c 100644
--- a/caffe2/operators/conv_transpose_op_impl.h
+++ b/caffe2/operators/conv_transpose_op_impl.h
@@ -6,18 +6,21 @@
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
+#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_transpose_op.h"
#include "caffe2/operators/conv_transpose_unpool_op_base.h"
#include "caffe2/utils/math.h"
+CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
+
namespace caffe2 {
template <typename T, class Context>
bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
- auto& X = Input(INPUT);
+ const Tensor<Context>& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
- auto* Y = Output(0);
+ Tensor<Context>* Y = Output(0);
const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
CAFFE_ENFORCE(
@@ -40,7 +43,6 @@
const int input_image_size = H * W;
const int output_image_size = Y->dim32(2) * Y->dim32(3);
- col_buffer_.Resize(vector<TIndex>{C, kernel_h_, kernel_w_, H, W});
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
@@ -50,65 +52,74 @@
&context_);
}
const T* Xdata = X.template data<T>();
- T* col_buffer_data = col_buffer_.template mutable_data<T>();
T* Ydata = Y->template mutable_data<T>();
- for (auto image_id = 0; image_id < N; ++image_id) {
- // Weight term
- math::Gemm<T, Context>(
- CblasTrans,
- CblasNoTrans,
- kernel_dim,
- input_image_size,
- M,
- 1,
- filter.template data<T>(),
- Xdata,
- 0,
- col_buffer_data,
- &context_);
- // Col2im
- math::Col2im<T, Context, StorageOrder::NCHW>(
- col_buffer_data,
- C,
- Y->dim32(2),
- Y->dim32(3),
- kernel_h_,
- kernel_w_,
- 1,
- 1,
- pad_t_,
- pad_l_,
- pad_b_,
- pad_r_,
- stride_h_,
- stride_w_,
- Ydata,
- &context_);
- // Bias term
- math::Gemm<T, Context>(
- CblasNoTrans,
- CblasNoTrans,
- C,
- output_image_size,
- 1,
- 1,
- bias.template data<T>(),
- bias_multiplier_.template data<T>(),
- 1,
- Ydata,
- &context_);
- Xdata += M * H * W;
- Ydata += Y->size() / Y->dim32(0);
+
+ auto f = [&](Tensor<Context>* col_buffer) {
+ col_buffer->Resize(vector<TIndex>{C, kernel_h_, kernel_w_, H, W});
+ T* col_buffer_data = col_buffer->template mutable_data<T>();
+ for (auto image_id = 0; image_id < N; ++image_id) {
+ // Weight term
+ math::Gemm<T, Context>(
+ CblasTrans,
+ CblasNoTrans,
+ kernel_dim,
+ input_image_size,
+ M,
+ 1,
+ filter.template data<T>(),
+ Xdata,
+ 0,
+ col_buffer_data,
+ &context_);
+ // Col2im
+ math::Col2im<T, Context, StorageOrder::NCHW>(
+ col_buffer_data,
+ C,
+ Y->dim32(2),
+ Y->dim32(3),
+ kernel_h_,
+ kernel_w_,
+ 1,
+ 1,
+ pad_t_,
+ pad_l_,
+ pad_b_,
+ pad_r_,
+ stride_h_,
+ stride_w_,
+ Ydata,
+ &context_);
+ // Bias term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ C,
+ output_image_size,
+ 1,
+ 1,
+ bias.template data<T>(),
+ bias_multiplier_.template data<T>(),
+ 1,
+ Ydata,
+ &context_);
+ Xdata += M * H * W;
+ Ydata += Y->size() / Y->dim32(0);
+ }
+ };
+ if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+ runWithSharedBuffer<Context>(ws_, f);
+ } else {
+ f(&col_buffer_);
}
return true;
}
template <typename T, class Context>
bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
- auto& X = Input(INPUT);
+ const Tensor<Context>& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
- auto* Y = Output(0);
+ Tensor<Context>* Y = Output(0);
const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
CAFFE_ENFORCE(
@@ -131,7 +142,6 @@
const auto input_image_size = H * W;
const auto output_image_size = Y->dim32(1) * Y->dim32(2);
- col_buffer_.Resize(vector<TIndex>{H, W, kernel_h_, kernel_w_, C});
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
@@ -141,55 +151,64 @@
&context_);
}
const T* Xdata = X.template data<T>();
- T* col_buffer_data = col_buffer_.template mutable_data<T>();
T* Ydata = Y->template mutable_data<T>();
- for (auto image_id = 0; image_id < N; ++image_id) {
- // Weight term
- math::Gemm<T, Context>(
- CblasNoTrans,
- CblasNoTrans,
- input_image_size,
- kernel_dim,
- M,
- 1,
- Xdata,
- filter.template data<T>(),
- 0,
- col_buffer_data,
- &context_);
- // Col2im
- math::Col2im<T, Context, StorageOrder::NHWC>(
- col_buffer_data,
- C,
- Y->dim32(1),
- Y->dim32(2),
- kernel_h_,
- kernel_w_,
- 1,
- 1,
- pad_t_,
- pad_l_,
- pad_b_,
- pad_r_,
- stride_h_,
- stride_w_,
- Ydata,
- &context_);
- // Bias term
- math::Gemm<T, Context>(
- CblasNoTrans,
- CblasNoTrans,
- output_image_size,
- C,
- 1,
- 1,
- bias_multiplier_.template data<T>(),
- bias.template data<T>(),
- 1,
- Ydata,
- &context_);
- Xdata += M * H * W;
- Ydata += Y->size() / Y->dim32(0);
+
+ auto f = [&](Tensor<Context>* col_buffer) {
+    col_buffer->Resize(vector<TIndex>{H, W, kernel_h_, kernel_w_, C});
+    T* col_buffer_data = col_buffer->template mutable_data<T>();
+ for (auto image_id = 0; image_id < N; ++image_id) {
+ // Weight term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ input_image_size,
+ kernel_dim,
+ M,
+ 1,
+ Xdata,
+ filter.template data<T>(),
+ 0,
+ col_buffer_data,
+ &context_);
+ // Col2im
+ math::Col2im<T, Context, StorageOrder::NHWC>(
+ col_buffer_data,
+ C,
+ Y->dim32(1),
+ Y->dim32(2),
+ kernel_h_,
+ kernel_w_,
+ 1,
+ 1,
+ pad_t_,
+ pad_l_,
+ pad_b_,
+ pad_r_,
+ stride_h_,
+ stride_w_,
+ Ydata,
+ &context_);
+ // Bias term
+ math::Gemm<T, Context>(
+ CblasNoTrans,
+ CblasNoTrans,
+ output_image_size,
+ C,
+ 1,
+ 1,
+ bias_multiplier_.template data<T>(),
+ bias.template data<T>(),
+ 1,
+ Ydata,
+ &context_);
+ Xdata += M * H * W;
+ Ydata += Y->size() / Y->dim32(0);
+ }
+ };
+ if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
+ runWithSharedBuffer<Context>(ws_, f);
+ } else {
+ f(&col_buffer_);
}
return true;
}
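
The restructuring above wraps the per-image GEMM/Col2im/bias loop in a lambda that receives its column buffer as a parameter, so the same body can run either with the operator's private col_buffer_ or with a workspace-wide buffer when the shared_buffer argument or the caffe2_force_shared_col_buffer flag is set. A minimal standalone sketch of that dispatch pattern follows; the mutex, the global buffer, and RunWithSharedColBuffer are illustrative stand-ins, not the actual runWithSharedBuffer from conv_op_shared.h.

    #include <functional>
    #include <mutex>
    #include <vector>

    // Illustrative stand-ins for a workspace-wide shared column buffer; the
    // real implementation lives behind runWithSharedBuffer<Context>(ws_, f).
    static std::mutex g_col_buffer_mutex;
    static std::vector<float> g_shared_col_buffer;

    void RunWithSharedColBuffer(
        const std::function<void(std::vector<float>*)>& body) {
      std::lock_guard<std::mutex> guard(g_col_buffer_mutex);
      body(&g_shared_col_buffer);
    }

    void RunConvTranspose(bool use_shared, std::vector<float>* private_buffer) {
      auto body = [&](std::vector<float>* col_buffer) {
        col_buffer->resize(1024);  // resized per call, like col_buffer->Resize(...)
        // ... per-image Gemm / Col2im / bias work would go here ...
      };
      if (use_shared) {
        RunWithSharedColBuffer(body);  // one buffer reused across operators
      } else {
        body(private_buffer);          // operator-private buffer, no locking
      }
    }
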
diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h
index 59aad86..675c150 100644
--- a/caffe2/operators/conv_transpose_unpool_op_base.h
+++ b/caffe2/operators/conv_transpose_unpool_op_base.h
@@ -44,7 +44,10 @@
"adj_w",
OperatorBase::GetSingleArgument<int>("adj", 0))),
order_(StringToStorageOrder(
- OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
+ OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
+ shared_buffer_(
+ OperatorBase::GetSingleArgument<int>("shared_buffer", 0)),
+ ws_(ws) {
CAFFE_ENFORCE(kernel_h_ > 0);
CAFFE_ENFORCE(kernel_w_ > 0);
// For the padding, they should either be the legacy padding strategy
@@ -151,6 +154,8 @@
int adj_h_;
int adj_w_;
StorageOrder order_;
+ bool shared_buffer_;
+ Workspace* ws_;
inline void ComputeSizeAndPad(
const int in_size,
@@ -182,17 +187,19 @@
}
};
-#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS \
- USE_OPERATOR_CONTEXT_FUNCTIONS; \
- using ConvTransposeUnpoolBase<Context>::pad_t_; \
- using ConvTransposeUnpoolBase<Context>::pad_b_; \
- using ConvTransposeUnpoolBase<Context>::pad_l_; \
- using ConvTransposeUnpoolBase<Context>::pad_r_; \
- using ConvTransposeUnpoolBase<Context>::kernel_h_; \
- using ConvTransposeUnpoolBase<Context>::kernel_w_; \
- using ConvTransposeUnpoolBase<Context>::stride_h_; \
- using ConvTransposeUnpoolBase<Context>::stride_w_; \
- using ConvTransposeUnpoolBase<Context>::order_
+#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS \
+ USE_OPERATOR_CONTEXT_FUNCTIONS; \
+ using ConvTransposeUnpoolBase<Context>::pad_t_; \
+ using ConvTransposeUnpoolBase<Context>::pad_b_; \
+ using ConvTransposeUnpoolBase<Context>::pad_l_; \
+ using ConvTransposeUnpoolBase<Context>::pad_r_; \
+ using ConvTransposeUnpoolBase<Context>::kernel_h_; \
+ using ConvTransposeUnpoolBase<Context>::kernel_w_; \
+ using ConvTransposeUnpoolBase<Context>::stride_h_; \
+ using ConvTransposeUnpoolBase<Context>::stride_w_; \
+ using ConvTransposeUnpoolBase<Context>::order_; \
+ using ConvTransposeUnpoolBase<Context>::shared_buffer_; \
+ using ConvTransposeUnpoolBase<Context>::ws_
} // namespace caffe2
diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc
index 8ce88f8..511fa97 100644
--- a/caffe2/operators/counter_ops.cc
+++ b/caffe2/operators/counter_ops.cc
@@ -8,6 +8,9 @@
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(CountDown, CountDownOp<int64_t, CPUContext>);
+REGISTER_CPU_OPERATOR(
+ CheckCounterDone,
+ CheckCounterDoneOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(CountUp, CountUpOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(RetrieveCount, RetrieveCountOp<int64_t, CPUContext>);
@@ -41,6 +44,15 @@
.Input(0, "counter", "A blob pointing to an instance of a counter.")
.Output(0, "done", "false unless the internal count is zero.");
+OPERATOR_SCHEMA(CheckCounterDone)
+ .NumInputs(1)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+If the internal count value <= 0, outputs true, otherwise outputs false.
+)DOC")
+ .Input(0, "counter", "A blob pointing to an instance of a counter.")
+ .Output(0, "done", "true if the internal count is zero or negative.");
+
OPERATOR_SCHEMA(CountUp)
.NumInputs(1)
.NumOutputs(1)
@@ -67,4 +79,6 @@
} // namespace
+CAFFE_KNOWN_TYPE(std::unique_ptr<Counter<int64_t>>);
+
} // namespace caffe2
diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h
index 73a91e0..cd939b2 100644
--- a/caffe2/operators/counter_ops.h
+++ b/caffe2/operators/counter_ops.h
@@ -8,7 +8,6 @@
#include "caffe2/core/operator.h"
namespace caffe2 {
-namespace {
template <typename T>
class Counter {
public:
@@ -28,6 +27,10 @@
return count_.load();
}
+  bool checkIfDone() const {
+ return (count_.load() <= 0);
+ }
+
void reset(T init_count) {
count_ = init_count;
}
@@ -35,7 +38,6 @@
private:
std::atomic<T> count_;
};
-}
// TODO(jiayq): deprecate these ops & consolidate them with IterOp/AtomicIterOp
@@ -98,6 +100,23 @@
// Will always use TensorCPU regardless the Context
template <typename T, class Context>
+class CheckCounterDoneOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ CheckCounterDoneOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws) {}
+
+ bool RunOnDevice() override {
+ auto& counterPtr = OperatorBase::Input<std::unique_ptr<Counter<T>>>(0);
+ auto* output = OperatorBase::Output<TensorCPU>(0);
+ output->Resize(std::vector<int>{});
+ *output->template mutable_data<bool>() = counterPtr->checkIfDone();
+ return true;
+ }
+};
+
+// Will always use TensorCPU regardless the Context
+template <typename T, class Context>
class CountUpOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
diff --git a/caffe2/operators/counter_ops_gpu.cc b/caffe2/operators/counter_ops_gpu.cc
index cda6740..de07e02 100644
--- a/caffe2/operators/counter_ops_gpu.cc
+++ b/caffe2/operators/counter_ops_gpu.cc
@@ -6,6 +6,9 @@
REGISTER_CUDA_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CUDAContext>);
REGISTER_CUDA_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CUDAContext>);
REGISTER_CUDA_OPERATOR(CountDown, CountDownOp<int64_t, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+ CheckCounterDone,
+ CheckCounterDoneOp<int64_t, CUDAContext>);
REGISTER_CUDA_OPERATOR(CountUp, CountUpOp<int64_t, CUDAContext>);
REGISTER_CUDA_OPERATOR(RetrieveCount, RetrieveCountOp<int64_t, CUDAContext>);
} // namespace
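
The new CheckCounterDone operator only observes the counter: it outputs true once the internal count is zero or negative and never modifies it, in contrast to CountDown, which decrements on every run. A self-contained sketch of the same counter semantics, without the Caffe2 operator machinery (the countDown return convention here is an assumption for illustration):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    // Minimal counter loosely mirroring Counter<int64_t>: countDown() reports
    // exhaustion while decrementing; checkIfDone() observes the same condition
    // without mutating the count.
    class SimpleCounter {
     public:
      explicit SimpleCounter(int64_t init) : count_(init) {}
      bool countDown() { return count_-- <= 0; }
      bool checkIfDone() const { return count_.load() <= 0; }

     private:
      std::atomic<int64_t> count_;
    };

    int main() {
      SimpleCounter c(2);
      assert(!c.checkIfDone());
      c.countDown();            // count: 2 -> 1
      c.countDown();            // count: 1 -> 0
      assert(c.checkIfDone());  // done once the count is no longer positive
      return 0;
    }
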
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index 831530a..2b1b0e1 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -634,6 +634,144 @@
}
};
+template <class Context>
+using TensorVectorPtr = std::unique_ptr<std::vector<Tensor<Context>>>;
+
+template <class Context>
+class CreateTensorVectorOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ using Operator<Context>::Operator;
+
+ bool RunOnDevice() override {
+ auto ptr = std::make_unique<std::vector<Tensor<Context>>>();
+ *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR) =
+ std::move(ptr);
+ return true;
+ }
+
+ private:
+ OUTPUT_TAGS(TENSOR_VECTOR);
+};
+
+template <class Context>
+class ConcatTensorVectorOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ using Operator<Context>::Operator;
+
+ bool RunOnDevice() override {
+ const TensorVectorPtr<Context>& tensorVector =
+ OperatorBase::Input<TensorVectorPtr<Context>>(TENSOR_VECTOR);
+
+ auto* tensor = Output(TENSOR);
+ CAFFE_ENFORCE(!tensorVector->empty());
+
+ vector<TIndex> outputDims(tensorVector->at(0).dims());
+ CAFFE_ENFORCE(outputDims.size() > 0);
+ for (int i = 1; i < tensorVector->size(); i++) {
+ // the tensor shapes are the same except for the first dimension
+ for (int j = 1; j < tensorVector->at(i).ndim(); j++) {
+ CAFFE_ENFORCE(outputDims[j] == tensorVector->at(i).dims()[j]);
+ }
+ CAFFE_ENFORCE(tensorVector->at(0).meta() == tensorVector->at(i).meta());
+ outputDims[0] += tensorVector->at(i).dims()[0];
+ }
+
+ tensor->Resize(outputDims);
+ TIndex offset = 0;
+ auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta());
+
+ for (const auto& t : *tensorVector) {
+ context_.template CopyItems<Context, Context>(
+ t.meta(), t.size(), t.raw_data(), dst + offset);
+ offset += t.nbytes();
+ }
+
+ return true;
+ }
+
+ private:
+ INPUT_TAGS(TENSOR_VECTOR);
+ OUTPUT_TAGS(TENSOR);
+};
+
+template <class Context>
+class CollectTensorOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ CollectTensorOp(const OperatorDef operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws),
+ numToCollect_(
+ OperatorBase::GetSingleArgument<int>("num_to_collect", -1)),
+ numVisited_(0) {
+ CAFFE_ENFORCE(numToCollect_ > 0);
+ }
+
+ bool RunOnDevice() override {
+ // TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
+ TensorVectorPtr<Context>& tensorVector =
+ *OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR_OUT);
+
+ auto* position_out = Output(POSITION_OUT);
+ const auto& tensor = Input(TENSOR_TO_COLLECT);
+
+ int pos = -1;
+ if (InputSize() >= 3) {
+ CAFFE_ENFORCE(0 == Input(POSITION_IN).ndim());
+ pos = Input(POSITION_IN).template data<int>()[0];
+ } else {
+ if (numVisited_ < numToCollect_) {
+ // append
+ pos = tensorVector->size();
+ } else {
+ CAFFE_ENFORCE(
+            tensorVector->size() == numToCollect_,
+            "TensorVector size = ",
+ tensorVector->size(),
+ " is different from numToCollect = ",
+ numToCollect_);
+ auto& gen = context_.RandGenerator();
+ // uniform between [0, numVisited_]
+ std::uniform_int_distribution<int> uniformDist(0, numVisited_);
+ pos = uniformDist(gen);
+ if (pos >= numToCollect_) {
+ // discard
+ pos = -1;
+ }
+ }
+ }
+
+ if (pos < 0) {
+ // discard
+ CAFFE_ENFORCE(numVisited_ >= numToCollect_);
+ } else if (pos >= tensorVector->size()) {
+ // append
+ tensorVector->push_back(Tensor<Context>());
+ tensorVector->back().template CopyFrom<Context, Context>(
+ tensor, &context_);
+ } else {
+ // replace
+ tensorVector->at(pos).template CopyFrom<Context, Context>(
+ tensor, &context_);
+ }
+
+ position_out->Resize(vector<TIndex>());
+ position_out->template mutable_data<int>()[0] = pos;
+
+ numVisited_++;
+ return true;
+ }
+
+ private:
+ // number of tensors to collect
+ int numToCollect_;
+ // number of tensors visited
+ int numVisited_;
+ INPUT_TAGS(TENSOR_VECTOR_IN, TENSOR_TO_COLLECT, POSITION_IN);
+ OUTPUT_TAGS(TENSOR_VECTOR_OUT, POSITION_OUT);
+};
+
REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
REGISTER_CPU_OPERATOR(ResetCursor, ResetCursorOp);
REGISTER_CPU_OPERATOR(ReadNextBatch, ReadNextBatchOp);
@@ -643,6 +781,9 @@
REGISTER_CPU_OPERATOR(CheckDatasetConsistency, CheckDatasetConsistencyOp);
REGISTER_CPU_OPERATOR(Append, AppendOp<CPUContext>);
REGISTER_CPU_OPERATOR(AtomicAppend, AtomicAppendOp<CPUContext>);
+REGISTER_CPU_OPERATOR(CreateTensorVector, CreateTensorVectorOp<CPUContext>);
+REGISTER_CPU_OPERATOR(ConcatTensorVector, ConcatTensorVectorOp<CPUContext>);
+REGISTER_CPU_OPERATOR(CollectTensor, CollectTensorOp<CPUContext>);
OPERATOR_SCHEMA(CreateTreeCursor)
.NumInputs(0)
@@ -850,6 +991,45 @@
.NumOutputs(1, INT_MAX)
.AllowInplace([](int in, int out) { return in == out + 1; });
+OPERATOR_SCHEMA(CreateTensorVector)
+ .NumInputs(0)
+ .NumOutputs(1)
+ .SetDoc("Create a std::unique_ptr<std::vector<Tensor> >");
+
+OPERATOR_SCHEMA(ConcatTensorVector)
+ .NumInputs(1)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Concat Tensors in the std::unique_ptr<std::vector<Tensor> >
+along the first dimension.
+ )DOC")
+ .Input(0, "vector of Tensor", "std::unique_ptr<std::vector<Tensor> >")
+ .Output(0, "tensor", "tensor after concatenating");
+
+OPERATOR_SCHEMA(CollectTensor)
+ .NumInputs(2, 3)
+ .NumOutputs(2)
+ .EnforceInplace({{0, 0}})
+ .AllowInplace({{2, 1}})
+ .SetDoc(R"DOC(
+Collect tensors into a tensor vector by reservoir sampling. The argument
+num_to_collect indicates the maximum number of tensors that will be
+collected.
+ )DOC")
+ .Arg("num_to_collect", "The max number of tensors to collect")
+ .Input(0, "input tensor vector", "tensor vector with collected tensors")
+ .Input(1, "tensor", "new tensor will be collected by reservoir sampling")
+ .Input(2, "input position", R"DOC(
+if provided, the new tensor is collected as indicated by position:
+if position < 0, the new tensor is discarded; if position == k and k < the size
+of the input tensor vector, the tensor at position k is replaced with the new tensor.
+ )DOC")
+  .Output(0, "output tensor vector", "enforced to be in-place with input 0")
+  .Output(1, "output position", R"DOC(
+the position at which the new tensor was collected;
+position < 0 means it was discarded.
+ )DOC");
+
SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
SHOULD_NOT_DO_GRADIENT(ReadNextBatch);
@@ -858,5 +1038,10 @@
SHOULD_NOT_DO_GRADIENT(CheckDatasetConsistency);
SHOULD_NOT_DO_GRADIENT(Append);
SHOULD_NOT_DO_GRADIENT(AtomicAppend);
-}
-}
+SHOULD_NOT_DO_GRADIENT(CreateTensorVector);
+SHOULD_NOT_DO_GRADIENT(ConcatTensorVector);
+SHOULD_NOT_DO_GRADIENT(CollectTensor);
+} // namespace
+CAFFE_KNOWN_TYPE(std::unique_ptr<TreeCursor>);
+CAFFE_KNOWN_TYPE(TensorVectorPtr<CPUContext>);
+} // caffe2
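
CollectTensor's position rule is classic reservoir sampling: the first num_to_collect tensors are appended, and every later tensor replaces a uniformly chosen slot with probability num_to_collect / (number visited + 1). A standalone sketch of the same logic on plain ints (the fixed seed and the ReservoirSample name are illustrative):

    #include <cstdio>
    #include <random>
    #include <vector>

    // Reservoir-sample `values` down to at most `num_to_collect` entries,
    // following the same position rule as CollectTensorOp: append while the
    // reservoir is not full, otherwise draw pos uniformly in [0, num_visited]
    // and replace only if pos lands inside the reservoir.
    std::vector<int> ReservoirSample(const std::vector<int>& values,
                                     int num_to_collect) {
      std::vector<int> reservoir;
      std::mt19937 gen(0);
      int num_visited = 0;
      for (int v : values) {
        int pos;
        if (num_visited < num_to_collect) {
          pos = static_cast<int>(reservoir.size());  // append
        } else {
          std::uniform_int_distribution<int> dist(0, num_visited);
          pos = dist(gen);
          if (pos >= num_to_collect) {
            pos = -1;  // discard
          }
        }
        if (pos >= 0 && pos >= static_cast<int>(reservoir.size())) {
          reservoir.push_back(v);
        } else if (pos >= 0) {
          reservoir[pos] = v;  // replace
        }
        ++num_visited;
      }
      return reservoir;
    }

    int main() {
      std::vector<int> data{1, 2, 3, 4, 5, 6, 7, 8};
      auto sample = ReservoirSample(data, 3);
      for (int v : sample) std::printf("%d ", v);  // 3 uniformly sampled values
      std::printf("\n");
      return 0;
    }
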
diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h
index 02d3d90..9098ee8 100644
--- a/caffe2/operators/filler_op.h
+++ b/caffe2/operators/filler_op.h
@@ -203,7 +203,7 @@
bool Fill(Tensor<Context>* output) override {
const int fan_in = output->size() / output->dim32(0);
- T scale = sqrt(T(3) / fan_in);
+ T scale = std::sqrt(T(3) / fan_in);
math::RandUniform<T, Context>(
output->size(), -scale, scale,
output->template mutable_data<T>(), &context_);
@@ -221,7 +221,7 @@
bool Fill(Tensor<Context>* output) override {
const int fan_in = output->size() / output->dim32(0);
- T scale = sqrt(T(2) / fan_in);
+ T scale = std::sqrt(T(2) / fan_in);
math::RandUniform<T, Context>(
output->size(), -scale, scale,
output->template mutable_data<T>(), &context_);
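
The only change above is qualifying sqrt with std::, but the surrounding fillers show the formulas involved: both draw uniformly from [-scale, scale] with scale = sqrt(3 / fan_in) and sqrt(2 / fan_in) respectively, where fan_in = size / dim(0). These appear to be the Xavier- and MSRA-style fillers, though the op names lie outside the hunk, so treat that as an assumption. A tiny worked example of the scale computation:

    #include <cmath>
    #include <cstdio>

    int main() {
      // For a weight blob of shape (num_output, fan_in_dims...), fan_in is the
      // product of all dimensions except the first, i.e. size / dim(0).
      const int size = 256 * 64;  // e.g. a 256 x 64 fully-connected weight
      const int dim0 = 256;
      const int fan_in = size / dim0;
      const float xavier_scale = std::sqrt(3.0f / fan_in);  // uniform in [-s, s]
      const float msra_scale = std::sqrt(2.0f / fan_in);
      std::printf("fan_in=%d xavier=%f msra=%f\n", fan_in, xavier_scale, msra_scale);
      return 0;
    }
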
diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc
index 5df4ce6..575b1e5 100644
--- a/caffe2/operators/index_ops.cc
+++ b/caffe2/operators/index_ops.cc
@@ -53,10 +53,10 @@
explicit Index(TIndexValue maxElements)
: IndexBase(maxElements, TypeMeta::Make<T>()) {}
- bool Get(const T* keys, TIndexValue* values, size_t numKeys) {
+ void Get(const T* keys, TIndexValue* values, size_t numKeys) {
if (frozen_) {
FrozenGet(keys, values, numKeys);
- return true;
+ return;
}
std::lock_guard<std::mutex> lock(dictMutex_);
for (int i = 0; i < numKeys; ++i) {
@@ -68,10 +68,9 @@
dict_.insert({keys[i], newValue});
values[i] = newValue;
} else {
- return false;
+ CAFFE_THROW("Dict max size reached");
}
}
- return true;
}
bool Load(const T* keys, size_t numKeys) {
@@ -152,8 +151,8 @@
const auto& keys = Input(1);
auto* values = Output(0);
values->ResizeLike(keys);
- return dict->Get(
- keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
+ dict->Get(keys.data<T>(), values->mutable_data<TIndexValue>(), keys.size());
+ return true;
}
};
@@ -431,6 +430,8 @@
}
};
+CAFFE_KNOWN_TYPE(std::unique_ptr<caffe2::IndexBase>);
+
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<std::unique_ptr<caffe2::IndexBase>>()),
IndexSerializer);
diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h
index 8a750a9..fc1e23e 100644
--- a/caffe2/operators/load_save_op.h
+++ b/caffe2/operators/load_save_op.h
@@ -80,7 +80,7 @@
VLOG(2) << "Deserializing blob " << key;
BlobProto proto;
- CHECK(proto.ParseFromString(cursor->value()));
+ CAFFE_ENFORCE(proto.ParseFromString(cursor->value()));
if (!keep_device_) {
// If we are not keeping the device as the one specified in the
// proto, we will set the current device.
@@ -97,7 +97,7 @@
// different GPU.
blob->Reset();
}
- CHECK(blob->Deserialize(proto));
+ CAFFE_ENFORCE(blob->Deserialize(proto));
if (!blob->IsType<Tensor<Context>>()) {
// Deal with non-tensors: we don't support chunking so we're done.
@@ -110,7 +110,7 @@
blobSize.first->second += proto.tensor().segment().end() -
proto.tensor().segment().begin();
} else {
- CHECK(blobSize.first->second == 0);
+ CAFFE_ENFORCE(blobSize.first->second == 0);
blobSize.first->second = tensorSize;
}
if (blobSize.first->second >= tensorSize) {
@@ -137,7 +137,15 @@
}
}
- CHECK_EQ(loaded.size(), OutputSize());
+ if (loaded.size() != OutputSize()) {
+ for (const string& output_name : this->def().output()) {
+ if (loaded.count(output_name) <= 0) {
+ LOG(ERROR) << "Failed to load blob: " << output_name;
+ }
+ }
+ CAFFE_THROW(
+ "Expected to load ", OutputSize(), " blobs, ", "got ", loaded.size());
+ }
}
private:
diff --git a/caffe2/operators/lstm_unit_op.h b/caffe2/operators/lstm_unit_op.h
index d4a77a7..97eced9 100644
--- a/caffe2/operators/lstm_unit_op.h
+++ b/caffe2/operators/lstm_unit_op.h
@@ -29,7 +29,7 @@
T* H,
Context* context) {
for (int n = 0; n < N; ++n) {
- const bool valid = seqLengths[n] < t;
+ const bool valid = t < seqLengths[n];
for (int d = 0; d < D; ++d) {
if (!valid) {
H[d] = 0;
@@ -69,7 +69,7 @@
T* X_diff,
Context* context) {
for (int n = 0; n < N; ++n) {
- const bool valid = seqLengths[n] < t;
+ const bool valid = t < seqLengths[n];
for (int d = 0; d < D; ++d) {
T* c_prev_diff = C_prev_diff + d;
T* i_diff = X_diff + d;
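
The substantive fix above is the direction of the comparison: a batch element n is still active at timestep t only while t < seqLengths[n]; the old seqLengths[n] < t treated exactly the wrong steps as valid. A minimal sketch of the masking rule on plain arrays (the gate math itself is omitted); MaskTimestep is an illustrative helper, not a Caffe2 function:

    #include <vector>

    // Apply the per-timestep validity mask used by the LSTM unit: for sequences
    // that have already ended (t >= seqLengths[n]) the hidden output is zeroed
    // and the cell state is carried forward unchanged.
    void MaskTimestep(int t, int D,
                      const std::vector<int>& seqLengths,
                      const std::vector<float>& C_prev,
                      std::vector<float>* C,
                      std::vector<float>* H) {
      const int N = static_cast<int>(seqLengths.size());
      for (int n = 0; n < N; ++n) {
        const bool valid = t < seqLengths[n];  // the corrected condition
        for (int d = 0; d < D; ++d) {
          const int idx = n * D + d;
          if (!valid) {
            (*H)[idx] = 0.0f;
            (*C)[idx] = C_prev[idx];
          }
          // When valid, the real op computes the usual LSTM gate update here.
        }
      }
    }
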
diff --git a/caffe2/operators/lstm_unit_op_gpu.cu b/caffe2/operators/lstm_unit_op_gpu.cu
index 2dae099..b21a62d 100644
--- a/caffe2/operators/lstm_unit_op_gpu.cu
+++ b/caffe2/operators/lstm_unit_op_gpu.cu
@@ -31,7 +31,7 @@
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
- const bool valid = seqLengths[n] < t;
+ const bool valid = t < seqLengths[n];
if (!valid) {
H[index] = 0;
C[index] = C_prev[index];
@@ -66,7 +66,7 @@
T* X_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
- const bool valid = seqLengths[n] < t;
+ const bool valid = t < seqLengths[n];
const int d = index % dim;
const T* X_offset = X + 4 * dim * n;
T* c_prev_diff = C_prev_diff + index;
diff --git a/caffe2/operators/matmul_op.cu b/caffe2/operators/matmul_op_gpu.cc
similarity index 100%
rename from caffe2/operators/matmul_op.cu
rename to caffe2/operators/matmul_op_gpu.cc
diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc
index 3d6227a..55333d3 100644
--- a/caffe2/operators/pack_segments.cc
+++ b/caffe2/operators/pack_segments.cc
@@ -14,9 +14,21 @@
class PackSegmentsOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
- USE_SIMPLE_CTOR_DTOR(PackSegmentsOp)
+ // USE_SIMPLE_CTOR_DTOR(PackSegmentsOp)
USE_DISPATCH_HELPER;
+ PackSegmentsOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws),
+ pad_minf_(
+ OperatorBase::GetSingleArgument<bool>("pad_minf", false)) {
+ if (pad_minf_) {
+ padding_ = -1.0 * std::numeric_limits<float>::infinity();
+ } else {
+ padding_ = 0;
+ }
+ }
+
+
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int, long>>::call(this, Input(LENGTHS));
}
@@ -30,17 +42,22 @@
CAFFE_ENFORCE(data.ndim() >= 1, "DATA should be at least 1-D");
CAFFE_ENFORCE(lengths.ndim() == 1, "LENGTH should be 1-D");
+ // Find the length of the longest sequence.
const T* l = lengths.template data<T>();
- T max_l = l[0];
+ T max_length = l[0];
for (T i = 1; i < lengths.dim(0); ++i) {
- max_l = std::max(max_l, l[i]);
+ max_length = std::max(max_length, l[i]);
}
- auto shape = data.dims();
- shape[0] = max_l;
+ auto shape = data.dims(); // Shape of output is batch_size x max_len x ...
+ shape[0] = max_length;
shape.insert(shape.begin(), lengths.size());
output->Resize(shape);
+    // Fill the output with the padding value (0 or -inf) before copying.
+    float* data_ptr = output->template mutable_data<float>();
+    std::fill(data_ptr, data_ptr + output->size(), padding_);
+
int block_size = data.size() / data.dim(0);
int block_bytesize = data.nbytes() / data.dim(0);
const auto* d = static_cast<const char*>(data.raw_data());
@@ -51,13 +68,17 @@
data.meta(),
l[i] * block_size,
d + block_bytesize * start,
- out + block_bytesize * max_l * i);
+ out + block_bytesize * max_length * i);
start += l[i];
}
+
return true;
}
INPUT_TAGS(LENGTHS, DATA);
+ private:
+ bool pad_minf_;
+ float padding_;
};
template <class Context>
@@ -82,9 +103,9 @@
const T* l = lengths.template data<T>();
- T max_l = l[0];
+ T max_length = l[0];
for (T i = 1; i < lengths.dim(0); ++i) {
- max_l = std::max(max_l, l[i]);
+ max_length = std::max(max_length, l[i]);
}
T total_l = std::accumulate(l, l + lengths.dim(0), 0);
@@ -119,7 +140,9 @@
OPERATOR_SCHEMA(PackSegments)
.NumInputs(2)
.NumOutputs(1)
- .SetDoc("Map N dim tensor to N+1 dim based on length blob")
+ .SetDoc(
+ "Map N dim tensor to N+1 dim based on length blob. Sequences that \
+ are shorter than the longest sequence are padded with zeros.")
.Input(
0,
"lengths",
@@ -130,7 +153,10 @@
"packed_tensor",
"N + 1 dim Tesor"
"where dim(1) is the max length"
- ", dim(0) is the batch size.");
+ ", dim(0) is the batch size.")
+ .Arg(
+ "pad_minf", "Padding number in the packed segments. Use true to pad \
+ -infinity, otherwise pad zeros");
OPERATOR_SCHEMA(UnpackSegments)
.NumInputs(2)
.NumOutputs(1)
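
PackSegments reshapes a flat (sum(lengths) x ...) tensor into a (batch x max_length x ...) tensor, filling the tail of short sequences with the padding value (zero, or -infinity when pad_minf is set). A standalone sketch of the packing on a flat float buffer; PackSegments here is an illustrative free function and block_size plays the role of block_size in the hunk above:

    #include <algorithm>
    #include <limits>
    #include <vector>

    // Pack rows of `data` (block_size floats per row) into a dense
    // [lengths.size() x max_len x block_size] buffer, padding the tail of
    // short sequences with `padding`.
    std::vector<float> PackSegments(const std::vector<float>& data,
                                    const std::vector<int>& lengths,
                                    int block_size,
                                    bool pad_minf) {
      const float padding =
          pad_minf ? -std::numeric_limits<float>::infinity() : 0.0f;
      int max_len = 0;
      for (int l : lengths) max_len = std::max(max_len, l);

      std::vector<float> packed(lengths.size() * max_len * block_size, padding);
      int start = 0;  // row offset into the flat input
      for (size_t i = 0; i < lengths.size(); ++i) {
        std::copy(data.begin() + start * block_size,
                  data.begin() + (start + lengths[i]) * block_size,
                  packed.begin() + i * max_len * block_size);
        start += lengths[i];
      }
      return packed;
    }
    // Example: lengths = {2, 1} produces a 2 x 2 x block_size buffer whose
    // last row is entirely padding.
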
diff --git a/caffe2/operators/packed_fc_op.cc b/caffe2/operators/packed_fc_op.cc
new file mode 100644
index 0000000..a4d392a
--- /dev/null
+++ b/caffe2/operators/packed_fc_op.cc
@@ -0,0 +1,139 @@
+#include <cstdint>
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/mkl_utils.h"
+
+#ifdef CAFFE2_HAS_MKL_SGEMM_PACK
+
+namespace caffe2 {
+
+CAFFE_KNOWN_TYPE(mkl::MKLPackedMatrix);
+
+namespace mkl {
+
+class PackedFCOp final : public Operator<CPUContext> {
+ public:
+ USE_OPERATOR_FUNCTIONS(CPUContext);
+ PackedFCOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<CPUContext>(operator_def, ws),
+ axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {}
+ ~PackedFCOp() {}
+
+ bool RunOnDevice() override {
+ const auto& X = Input(0);
+ const auto& b = Input(2);
+ auto* Y = Output(0);
+ CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
+ // batch size
+ const auto canonical_axis = X.canonical_axis_index(axis_);
+ const int M = X.size_to_dim(canonical_axis);
+ const int K = X.size_from_dim(canonical_axis);
+ const int N = b.size();
+
+ // Check out what is the passed in format.
+ const MKLPackedMatrix* packed_matrix = nullptr;
+ if (OperatorBase::InputIsType<TensorCPU>(1)) {
+ const auto& W = Input(1);
+ CAFFE_ENFORCE_EQ(W.ndim(), 2);
+ CAFFE_ENFORCE_EQ(W.dim32(0), N);
+ CAFFE_ENFORCE_EQ(W.dim32(1), K);
+ // Note(jiayq): This will strictly check that we have a proper usage of
+ // the PackedFC operator. The motivation is that, this operator is
+ // stateful unlike most ops in Caffe2, but checking whether the weight
+ // has changed matters quite a lot in the critical path. We only enable
+ // this test during DEBUG mode for performance considerations.
+ DCHECK(hash_ == 0 || hash_ == Hash(W.template data<float>(), W.size()))
+ << "PackedFCOp is currently stateful: you should not change the "
+ "weight during runtime. This is only sanity-checked in debug "
+ "mode for speed considerations.";
+ if (!local_packed_matrix_.get() || local_packed_matrix_->n_ != M) {
+ // If there is no pre packed matrix, or the batch size changed, we
+ // do a re-pack.
+ // Note that the packed sgemm follows the blas interfaces, not cblas
+ local_packed_matrix_.reset(new MKLPackedMatrix(
+ 'A', 'T', N, M, K, 1.f, W.template data<float>(), K));
+ }
+ packed_matrix = local_packed_matrix_.get();
+ } else if (OperatorBase::InputIsType<MKLPackedMatrix>(1)) {
+ packed_matrix = &OperatorBase::Input<MKLPackedMatrix>(1);
+    }
+    CAFFE_ENFORCE(packed_matrix, "Input 1 must be a TensorCPU or an MKLPackedMatrix.");
+    CAFFE_ENFORCE_EQ(packed_matrix->m_, N);
+ CAFFE_ENFORCE_EQ(packed_matrix->k_, K);
+ CAFFE_ENFORCE_EQ(packed_matrix->n_, M);
+ // Do we want to check the other flags as well?
+
+ Y->Resize(M, N);
+
+ const float kZero = 0;
+ sgemm_compute(
+ "P",
+ "N",
+ &N,
+ &M,
+ &K,
+ packed_matrix->data_,
+ &K,
+ X.template data<float>(),
+ &K,
+ &kZero,
+ Y->template mutable_data<float>(),
+ &N);
+
+ // Add bias term
+ if (bias_multiplier_.size() != M) {
+ // If the helper bias multiplier is not M, reshape and fill it with one.
+ bias_multiplier_.Resize(M);
+ math::Set<float, CPUContext>(
+ M, 1.f, bias_multiplier_.template mutable_data<float>(), &context_);
+ }
+ math::Gemm<float, CPUContext>(
+ CblasNoTrans,
+ CblasNoTrans,
+ M,
+ N,
+ 1,
+ 1,
+ bias_multiplier_.template data<float>(),
+ b.template data<float>(),
+ 1,
+ Y->template mutable_data<float>(),
+ &context_);
+ return true;
+ }
+
+ protected:
+ uint32_t Hash(const float* ptr, size_t n) {
+ uint32_t hash = 0;
+ const uint32_t* ptr_i = reinterpret_cast<const uint32_t*>(ptr);
+ for (int i = 0; i < n; ++i) {
+ hash ^= ptr_i[i];
+ }
+ return hash;
+ }
+ size_t axis_{1};
+ uint32_t hash_{0};
+ Tensor<CPUContext> bias_multiplier_;
+ std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
+};
+
+} // namespace mkl
+
+REGISTER_CPU_OPERATOR(PackedFC, mkl::PackedFCOp);
+
+OPERATOR_SCHEMA(PackedFC).NumInputs(3).NumOutputs(1).SetDoc(R"DOC(
+Computes the result of passing an input vector X into a fully connected
+layer with 2D weight matrix W and 1D bias vector b. This is essentially the
+same as the FC operator but allows one to pack the weight matrix for more
+efficient inference. See the schema for the FC op for details.
+
+Unlike many other operators in Caffe2, this operator is stateful: it assumes
+that the input weight matrix W never changes, so it is only suitable for
+inference time when the weight matrix never gets updated by any other ops.
+Due to performance considerations, this is not checked in non-debug builds.
+)DOC");
+
+SHOULD_NOT_DO_GRADIENT(PackedFC);
+} // namespace caffe2
+
+#endif // CAFFE2_HAS_MKL_SGEMM_PACK
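
The bias term in PackedFC (and in the convolution ops earlier in this diff) is added with a rank-1 GEMM: an M x 1 column of ones times the 1 x N bias row, accumulated into Y with beta = 1. Spelled out without BLAS, that product is just the broadcast below; AddBias is an illustrative equivalent, not the code path the operator takes:

    #include <vector>

    // Equivalent of Gemm(NoTrans, NoTrans, M, N, 1, 1.0, ones(M), b(N), 1.0, Y):
    // add bias b[j] to every row of the M x N output Y.
    void AddBias(int M, int N, const std::vector<float>& b, std::vector<float>* Y) {
      for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
          (*Y)[i * N + j] += 1.0f * b[j];  // ones-multiplier * bias, accumulated
        }
      }
    }
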
diff --git a/caffe2/operators/partition_ops.cc b/caffe2/operators/partition_ops.cc
index ca998b9..84a0b04 100644
--- a/caffe2/operators/partition_ops.cc
+++ b/caffe2/operators/partition_ops.cc
@@ -3,13 +3,14 @@
namespace caffe2 {
namespace {
-REGISTER_CPU_OPERATOR(Sharding, ShardingOp<CPUContext>);
+REGISTER_CPU_OPERATOR(Partition, PartitionOp);
+REGISTER_CPU_OPERATOR(LengthsPartition, LengthsPartitionOp);
-OPERATOR_SCHEMA(Sharding)
- .NumInputsOutputs([](int in, int out) {
- return in > 0 && out > 0 && out % in == 0;
- })
- .SetDoc(R"DOC(
+OPERATOR_SCHEMA(Partition)
+ .NumInputsOutputs([](int in, int out) {
+ return in > 0 && out > 0 && out % in == 0;
+ })
+ .SetDoc(R"DOC(
Sharding splits the input int tensor into multiple ones according to the first
tensor.
@@ -25,19 +26,69 @@
X_ij / num_partitions.
Outputs are ordered as
-X_0_part_0, X_0_part_1, ..., X_0_part_K-1, X_1_part_0, ..., X_N-1_part_K-1
+X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
)DOC")
- .Arg("pack_first_input", "(int, default 0) If set, the operator transforms "
- "the first tensor values as floor(X_ij / num_partitions)")
- .Input(0, "input", "Input tensor containing data to be sharded. The "
- "number of input tensors might be greater than 1 but must have the "
- "same shape as the previous tensors.")
- .Output(0, "shards", "Output Shards. The number of output shards has to be a "
- "multiple of the number of input shards.");
+ .Arg(
+ "pack_first_input",
+ "(int, default 0) If set, the operator transforms "
+ "the first tensor values as floor(X_ij / num_partitions)")
+ .Input(
+ 0,
+ "input",
+ "Input tensor containing data to be sharded. The "
+ "number of input tensors might be greater than 1 but must have the "
+ "same shape as the previous tensors.")
+ .Output(
+ 0,
+ "shards",
+ "Output Shards. The number of output shards has to be a "
+ "multiple of the number of input shards.");
+
+OPERATOR_SCHEMA(LengthsPartition)
+ .NumInputsOutputs([](int in, int out) {
+ return in >= 2 && out > 0 && out % in == 0;
+ })
+ .SetDoc(R"DOC(
+LengthsPartition splits the input int tensor into multiple ones according to the
+second tensor. The first input is expected to be the tensor that describes the
+lengths of the elements.
+
+Takes the second input and partitions it to shards according to the remainder of
+values modulo the number of partitions. It requires the second tensor to be
+a 1-D tensor of integral type. The first tensor should be a 1-D tensor of int32
+that represents the lengths of the elements in the input. The number of
+partitions is derived as (num_output / num_input).
+
+If additional inputs are present they must have the same shape as the first
+input, optionally with extra trailing dimensions. They will be partitioned
+accordingly to the first input.
+
+Optional arg 'pack_first_input' transforms the first tensor values as
+X_ij / num_partitions.
+
+Outputs are ordered as
+X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
+)DOC")
+ .Arg(
+ "pack_first_input",
+ "(int, default 0) If set, the operator transforms "
+ "the first tensor values as floor(X_ij / num_partitions)")
+ .Input(
+ 0,
+ "input",
+ "Input tensor containing data to be sharded. The "
+ "number of input tensors might be greater than 1 but must have the "
+ "same shape as the previous tensors.")
+ .Output(
+ 0,
+ "shards",
+ "Output Shards. The number of output shards has to be a "
+ "multiple of the number of input shards.");
// This should actually have gradient, but for now nothing uses it.
// Because gradient computation right now is not input/output aware it can't be
// GRADIENT_NOT_IMPLEMENTEDYET
NO_GRADIENT(Sharding);
+NO_GRADIENT(LengthsPartition);
} // namespace
} // namespace caffe2
diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h
index f2bbefa..1f1c74f 100644
--- a/caffe2/operators/partition_ops.h
+++ b/caffe2/operators/partition_ops.h
@@ -6,29 +6,27 @@
namespace caffe2 {
-template <class Context>
-class ShardingOp : public Operator<Context> {
+class PartitionOpBase : public Operator<CPUContext> {
public:
- USE_OPERATOR_CONTEXT_FUNCTIONS;
- USE_DISPATCH_HELPER;
+ USE_OPERATOR_FUNCTIONS(CPUContext);
- ShardingOp(const OperatorDef& operator_def, Workspace* ws)
- : Operator<Context>(operator_def, ws),
+ PartitionOpBase(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<CPUContext>(operator_def, ws),
OP_SINGLE_ARG(int, "pack_first_input", pack_first_input_, 0) {}
- bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
- }
-
- private:
+ protected:
template <typename Index>
- bool DoRunWithType() {
- CHECK_EQ(OutputSize() % InputSize(), 0)
- << "Output number must be a multiple of input number";
+ void ApplyPartition(bool skipFirstArgument) {
+ CAFFE_ENFORCE_EQ(
+ OutputSize() % InputSize(),
+ 0,
+ "Output number must be a multiple of input number");
int partitions = OutputSize() / InputSize();
- CHECK_GT(partitions, 0);
+ int inputSize = InputSize();
+ int mainInputIndex = skipFirstArgument;
+ CAFFE_ENFORCE_GT(partitions, 0, "Invalid number of partitions");
- auto& main_input = Input(0);
+ auto& main_input = Input(mainInputIndex);
TIndex size = main_input.size();
const Index* data = main_input.template data<Index>();
counts_.assign(partitions, 0);
@@ -40,32 +38,43 @@
++counts_[shard];
}
- raw_datas_.resize(InputSize());
- block_sizes_.resize(InputSize());
+ raw_datas_.resize(inputSize);
+ block_sizes_.resize(inputSize);
+ metas_.resize(inputSize);
out_datas_.resize(OutputSize());
- for (int i = 0; i < InputSize(); ++i) {
+ for (int i = mainInputIndex; i < inputSize; ++i) {
auto& input = Input(i);
- if (i > 0) {
- CHECK_GE(input.ndim(), main_input.ndim())
- << "Prefix of extra input's shape must match main input's shape, "
- << "input: " << i;
+ if (i > mainInputIndex) {
+ CAFFE_ENFORCE_GE(
+ input.ndim(),
+ main_input.ndim(),
+ "Prefix of extra input's shape must match main input's shape, ",
+ "input: ",
+ i);
for (int j = 0; j < main_input.ndim(); ++j) {
- CHECK_GE(input.dim(j), main_input.dim(j))
- << "Prefix of extra input's shape must match main input's shape, "
- << "input: " << i << ", dim " << j;
+ CAFFE_ENFORCE_GE(
+ input.dim(j),
+ main_input.dim(j),
+ "Prefix of extra input's shape must match main input's shape, ",
+ "input: ",
+ i,
+ ", dim ",
+ j);
}
- CHECK(input.meta().copy() == nullptr)
- << "Only primitive types are supported, input " << i;
+ CAFFE_ENFORCE(
+ input.meta().copy() == nullptr,
+ "Only primitive types are supported, input ",
+ i);
}
raw_datas_[i] = input.raw_data();
- block_sizes_[i] =
- input.size_from_dim(main_input.ndim()) * input.itemsize();
+ block_sizes_[i] = input.size_from_dim(main_input.ndim());
+ metas_[i] = input.meta();
// shape = partition_size + suffix of input dims
vector<TIndex> shape(
input.dims().begin() + main_input.ndim() - 1, input.dims().end());
for (int j = 0; j < partitions; ++j) {
- int out_idx = i * partitions + j;
- auto* output = Output(out_idx);
+ int out_idx = i + j * inputSize;
+ auto output = Output(out_idx);
shape[0] = counts_[j];
output->Resize(shape);
out_datas_[out_idx] = output->raw_mutable_data(input.meta());
@@ -81,21 +90,22 @@
TIndex idx = counts_[shard]++;
// special case first input
- static_cast<Index*>(out_datas_[shard])[idx] =
+ static_cast<Index*>(out_datas_[shard * inputSize + mainInputIndex])[idx] =
pack_first_input_ ? ((data[p] - shard) / partitions) : data[p];
- for (int i = 1, j = shard + partitions; i < InputSize();
- ++i, j += partitions) {
+ int baseIndex = shard * inputSize;
+ for (int i = mainInputIndex + 1; i < inputSize; ++i) {
auto bs = block_sizes_[i];
+ auto meta = metas_[i];
// special case for small bs?
- context_.template CopyBytes<Context, Context>(
+ context_.template CopyItems<CPUContext, CPUContext>(
+ meta,
bs,
- static_cast<const char*>(raw_datas_[i]) + p * bs,
- static_cast<char*>(out_datas_[j]) + idx * bs);
+ static_cast<const char*>(raw_datas_[i]) + p * bs * meta.itemsize(),
+ static_cast<char*>(out_datas_[baseIndex + i]) +
+ idx * bs * meta.itemsize());
}
}
-
- return true;
}
bool pack_first_input_;
@@ -103,10 +113,101 @@
// use member fields to reuse memory
vector<TIndex> counts_;
vector<TIndex> block_sizes_;
+ vector<TypeMeta> metas_;
vector<const void*> raw_datas_;
vector<void*> out_datas_;
+};
- DISABLE_COPY_AND_ASSIGN(ShardingOp);
+class PartitionOp : public PartitionOpBase {
+ public:
+ USE_DISPATCH_HELPER;
+
+ PartitionOp(const OperatorDef& operator_def, Workspace* ws)
+ : PartitionOpBase(operator_def, ws) {}
+
+ bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
+ }
+
+ private:
+ template <typename Index>
+ bool DoRunWithType() {
+ ApplyPartition<Index>(false /* skipFirstArgument */);
+ return true;
+ }
+
+ DISABLE_COPY_AND_ASSIGN(PartitionOp);
+};
+
+class LengthsPartitionOp : public PartitionOpBase {
+ public:
+ USE_DISPATCH_HELPER;
+
+ LengthsPartitionOp(const OperatorDef& operator_def, Workspace* ws)
+ : PartitionOpBase(operator_def, ws) {}
+
+ bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(1));
+ }
+
+ private:
+ template <typename Index>
+ bool DoRunWithType() {
+ CAFFE_ENFORCE(
+ OutputSize() % InputSize() == 0,
+ "Output number must be a multiple of input number");
+ int partitions = OutputSize() / InputSize();
+ CAFFE_ENFORCE_GT(partitions, 0, "Invalid number of partitions");
+ CAFFE_ENFORCE_EQ(
+ Input(1).ndim(),
+ 1,
+ "Only 1-D tensors supported as a partitioning tensor for sharding");
+
+ // Apply sharding to all parameters except lengths
+ ApplyPartition<Index>(true /* skipFirstArgument */);
+
+ // Compute lengths after sharding
+ auto& main_input = Input(1);
+ TIndex size = main_input.size();
+ const Index* data = main_input.template data<Index>();
+
+ auto& length_input = Input(0);
+ TIndex elements = length_input.size();
+ const int32_t* lengths_data = length_input.template data<int32_t>();
+ out_length_.resize(partitions);
+ for (int i = 0; i < partitions; ++i) {
+ auto& output = *Output(i * InputSize());
+ output.Resize(elements);
+ out_length_[i] = output.template mutable_data<int32_t>();
+ }
+
+ int total_length = 0;
+ for (int i = 0; i < elements; ++i) {
+ total_length += lengths_data[i];
+ }
+    CAFFE_ENFORCE(
+        total_length == size,
+        "Total length does not match the number of elements");
+
+ int index = 0;
+ for (int i = 0; i < elements; ++i) {
+ for (int j = 0; j < partitions; ++j) {
+ out_length_[j][i] = 0;
+ }
+ for (int j = 0; j < lengths_data[i]; ++j, ++index) {
+ // TODO: support other partition functions
+ int shard = data[index] % partitions;
+ // equivalent to `if (shard < 0) shard += partitions;`
+ shard += partitions & (shard >> (sizeof(int) * 8 - 1));
+ ++out_length_[shard][i];
+ }
+ }
+ return true;
+ }
+
+ DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp);
+
+ vector<int32_t*> out_length_;
};
} // namespace caffe2
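
Both partition ops compute the shard with a branch-free fix-up for negative remainders: in C++, value % partitions carries the sign of value, and `shard += partitions & (shard >> (sizeof(int) * 8 - 1))` adds partitions exactly when the sign bit of shard is set (arithmetic right shift of a negative int is technically implementation-defined, but yields all ones on the usual two's-complement targets). A small check of the equivalence; ShardOf is an illustrative helper:

    #include <cassert>

    // Branch-free equivalent of:
    //   shard = value % partitions; if (shard < 0) shard += partitions;
    int ShardOf(int value, int partitions) {
      int shard = value % partitions;
      // (shard >> 31) is all ones when shard is negative, zero otherwise.
      shard += partitions & (shard >> (sizeof(int) * 8 - 1));
      return shard;
    }

    int main() {
      assert(ShardOf(7, 4) == 3);
      assert(ShardOf(-1, 4) == 3);  // -1 % 4 == -1 in C++, fixed up to 3
      assert(ShardOf(-8, 4) == 0);
      return 0;
    }
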
diff --git a/caffe2/operators/recurrent_network_op.h b/caffe2/operators/recurrent_network_op.h
index e73eac9..a15ef6af 100644
--- a/caffe2/operators/recurrent_network_op.h
+++ b/caffe2/operators/recurrent_network_op.h
@@ -336,14 +336,15 @@
const auto stepNet =
OperatorBase::GetSingleArgument<string>("backward_step_net", "");
NetDef stepNetDef;
- CHECK(google::protobuf::TextFormat::ParseFromString(stepNet, &stepNetDef));
+ CAFFE_ENFORCE(
+ google::protobuf::TextFormat::ParseFromString(stepNet, &stepNetDef));
ws_.CreateBlob(timestep_)->template GetMutable<TensorCPU>()->Resize(1);
for (const auto& blob : stepNetDef.external_input()) {
ws_.CreateBlob(blob);
}
stepNet_ = ws_.CreateNet(stepNetDef);
- CHECK(stepNet_);
+ CAFFE_ENFORCE(stepNet_);
}
std::vector<detail::Scratch> constructScratches(Workspace* sharedWs) {
@@ -483,7 +484,7 @@
->template Get<Tensor<Context>>();
auto* ag = CHECK_NOTNULL(ws_.GetBlob(param.accGrad))
->template GetMutable<Tensor<Context>>();
- CHECK(ag->dims() == g.dims());
+ CAFFE_ENFORCE(ag->dims() == g.dims());
math::Add<T, Context>(
g.size(),
g.template data<T>(),
diff --git a/caffe2/operators/recurrent_op_cudnn.cc b/caffe2/operators/recurrent_op_cudnn.cc
index d8379a5..6dffab1 100644
--- a/caffe2/operators/recurrent_op_cudnn.cc
+++ b/caffe2/operators/recurrent_op_cudnn.cc
@@ -72,7 +72,7 @@
CHECK_GT(hiddenSize, 0);
const auto bidirectional =
OperatorBase::GetSingleArgument<int>("bidirectional", 0);
- CHECK(bidirectional == 0 || bidirectional == 1);
+ CAFFE_ENFORCE(bidirectional == 0 || bidirectional == 1);
const auto numDirections = bidirectional == 1 ? 2 : 1;
const auto outputDim = hiddenSize * numDirections;
const auto rnnDirection =
@@ -81,11 +81,11 @@
CHECK_GT(numLayers, 0);
const auto& rnnModeStr =
OperatorBase::GetSingleArgument<string>("rnn_mode", "");
- CHECK(rnnModeStr == "lstm" || rnnModeStr == "gru");
+ CAFFE_ENFORCE(rnnModeStr == "lstm" || rnnModeStr == "gru");
const auto rnnMode = rnnModeStr == "lstm" ? CUDNN_LSTM : CUDNN_GRU;
const auto& rnnInputStr =
OperatorBase::GetSingleArgument<string>("input_mode", "");
- CHECK(rnnInputStr == "linear" || rnnInputStr == "skip");
+ CAFFE_ENFORCE(rnnInputStr == "linear" || rnnInputStr == "skip");
const auto rnnInput =
rnnInputStr == "linear" ? CUDNN_LINEAR_INPUT : CUDNN_SKIP_INPUT;
// Dropout setup
diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc
index 0fe44be..d21ae39 100644
--- a/caffe2/operators/segment_reduction_op.cc
+++ b/caffe2/operators/segment_reduction_op.cc
@@ -39,15 +39,19 @@
const SIndex* s_ids = segment_ids.template data<SIndex>();
const T* d = data.template data<T>();
- CHECK_GT(N, 0);
- const SIndex K = s_ids[N - 1] + 1;
+ const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
auto shape = data.dims();
shape[0] = K;
output->Resize(shape);
- TIndex block_size = data.size() / N;
T* out = output->template mutable_data<T>();
+ if (N == 0) {
+ return true;
+ }
+
+ TIndex block_size = data.size() / N;
+
// Assume the segments are sorted and there are no gaps
CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
for (TIndex i = 0; i < N;) {
@@ -106,9 +110,14 @@
data_grads->Resize(shape);
const SIndex K = segment_grads.dim(0);
- TIndex block_size = segment_grads.size() / K;
T* out = data_grads->template mutable_data<T>();
+ if (N == 0) {
+ return true;
+ }
+
+ TIndex block_size = segment_grads.size_from_dim(1);
+
// Assume the segments are sorted and there are no gaps
CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
// repeat the check from forward op
@@ -264,7 +273,7 @@
output->Resize(shape);
TIndex in_block_size = data.size_from_dim(num_reduce_dims_);
- TIndex block_num = data.size() / in_block_size;
+ TIndex block_num = in_block_size > 0 ? data.size() / in_block_size : 0;
T* out = output->template mutable_data<T>();
Reducer r(ctx, out, &context_);
@@ -321,7 +330,7 @@
data_grads->Resize(shape);
TIndex block_size = data_grads->size_from_dim(num_reduce_dims_);
- TIndex block_num = data_grads->size() / block_size;
+ TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0;
T* out = data_grads->template mutable_data<T>();
ReducerGradient r(ctx, r_grad, &context_);
@@ -431,11 +440,9 @@
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
bool RunOnDevice() override {
- auto& data = Input(0);
- const TIndex M = data.dim(0);
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
- TIndex in_block_size = data.size() / M;
+ TIndex in_block_size = Input(0).size_from_dim(1);
return DispatchHelper<typename Reducer::FixedDispatch>::call(
this, in_block_size);
}
@@ -477,16 +484,18 @@
const SIndex* s_ids = segment_ids.template data<SIndex>();
const T* d = data.template data<T>();
- CHECK_GT(N, 0);
- const SIndex K = s_ids[N - 1] + 1;
+ const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
vector<TIndex> shape;
shape.push_back(K);
ctx.appendOutputShape(&shape);
output->Resize(shape);
- TIndex in_block_size = data.size() / M;
- TIndex out_block_size = output->size() / K;
T* out = output->template mutable_data<T>();
+ if (N == 0) {
+ return true;
+ }
+ TIndex in_block_size = data.size_from_dim(1);
+ TIndex out_block_size = output->size_from_dim(1);
// Assume the segments are sorted and there are no gaps
CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
@@ -497,8 +506,12 @@
for (; i < N && s_ids[start] == s_ids[i]; ++i) {
TIndex idx;
if (SparseFused) { // static if
- CHECK(0 <= idxs[i] && idxs[i] < M)
- << "Index out of bounds: " << idxs[i] << ", range 0 to " << M;
+ CAFFE_ENFORCE(
+ 0 <= idxs[i] && idxs[i] < M,
+ "Index out of bounds: ",
+ idxs[i],
+ ", range 0 to ",
+ M);
idx = idxs[i];
} else {
idx = i;
@@ -532,10 +545,9 @@
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
bool RunOnDevice() override {
- auto& segment_grads = Input(SEGMENT_GRADS);
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
- TIndex grad_block_size = segment_grads.size() / segment_grads.dim(0);
+ TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, grad_block_size);
}
@@ -566,11 +578,15 @@
ctx.appendGradShape(&shape);
data_grads->Resize(shape);
- TIndex d_block_size = data_grads->size() / data_grads->dim(0);
+ TIndex d_block_size = data_grads->size_from_dim(1);
const SIndex K = segment_grads.dim(0);
- TIndex s_block_size = segment_grads.size() / K;
+ TIndex s_block_size = segment_grads.size_from_dim(1);
T* out = data_grads->template mutable_data<T>();
+ if (N == 0) {
+ return true;
+ }
+
// Assume the segments are sorted and there are no gaps
CHECK_EQ(0, s_ids[0]) << "Indices must be sorted and not have gaps";
// repeat the check from forward op
@@ -784,11 +800,9 @@
OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
bool RunOnDevice() override {
- auto& data = Input(0);
- const TIndex M = data.dim(0);
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
- TIndex in_block_size = data.size() / M;
+ TIndex in_block_size = Input(0).size_from_dim(1);
return DispatchHelper<typename Reducer::FixedDispatch>::call(
this, in_block_size);
}
@@ -846,8 +860,8 @@
ctx.appendOutputShape(&shape);
output->Resize(shape);
- TIndex in_block_size = data.size() / M;
- TIndex out_block_size = output->size() / K;
+ TIndex in_block_size = data.size_from_dim(1);
+ TIndex out_block_size = output->size_from_dim(1);
T* out = output->template mutable_data<T>();
reducers_.clear();
@@ -858,12 +872,20 @@
for (TIndex i = 0; i < N; ++i) {
auto s_id = s_ids[i];
- CHECK(0 <= s_id && s_id < K) << "Segment id out of range: " << s_id
- << ", range 0 to " << K;
+ CAFFE_ENFORCE(
+ 0 <= s_id && s_id < K,
+ "Segment id out of range: ",
+ s_id,
+ ", range 0 to ",
+ K);
TIndex idx;
if (SparseFused) { // static if
- CHECK(0 <= idxs[i] && idxs[i] < M) << "Index out of bounds: " << idxs[i]
- << ", range 0 to " << M;
+ CAFFE_ENFORCE(
+ 0 <= idxs[i] && idxs[i] < M,
+ "Index out of bounds: ",
+ idxs[i],
+ ", range 0 to ",
+ M);
idx = idxs[i];
} else {
idx = i;
@@ -897,10 +919,9 @@
USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
bool RunOnDevice() override {
- auto& segment_grads = Input(SEGMENT_GRADS);
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
- TIndex grad_block_size = segment_grads.size() / segment_grads.dim(0);
+ TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, grad_block_size);
}
@@ -931,9 +952,9 @@
ctx.appendGradShape(&shape);
data_grads->Resize(shape);
- TIndex d_block_size = data_grads->size() / data_grads->dim(0);
+ TIndex d_block_size = data_grads->size_from_dim(1);
const SIndex K = segment_grads.dim(0);
- TIndex s_block_size = segment_grads.size() / K;
+ TIndex s_block_size = segment_grads.size_from_dim(1);
T* out = data_grads->template mutable_data<T>();
reducers_.clear();
@@ -944,8 +965,12 @@
for (TIndex i = 0; i < N; ++i) {
auto s_id = s_ids[i];
- CHECK(0 <= s_id && s_id < K) << "Segment id out of range: " << s_id
- << ", range 0 to " << K;
+ CAFFE_ENFORCE(
+ 0 <= s_id && s_id < K,
+ "Segment id out of range: ",
+ s_id,
+ ", range 0 to ",
+ K);
reducers_[s_id].template fillGrad<FixedSize>(
ctx, out + d_block_size * i, i, &context_);
}
@@ -1086,6 +1111,351 @@
true /*SparseFused*/>;
};
+/**
+ * @brief Segment reduction op with optional fused embedding lookup
+ *
+ * Base implementation for LengthsXXX and SparseLengthsXXX depending
+ * on SparseFused static argument.
+ *
+ * Inputs:
+ * 0: DATA - input embedding to do lookups in
+ * 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
+ *                     reducer; should have the same first dimension as
+ *                     INDICES (or DATA when SparseFused == false),
+ *                     e.g. scalars in WeightedSum
+ * # if SparseFused == true:
+ * P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the
+ * same dimension as LENGTHS
+ * # if SparseFused == false, LENGTHS is at P+1 instead:
+ * P+1 or P+2: LENGTHS - segment lengths over the indices (or data) vector
+ *
+ * Output:
+ * Tensor with first dimension of K, where K = len(LENGTHS). Rest
+ * of dimensions are decided by reducer but usually are the same size as extra
+ * dimensions of DATA
+ */
+// TODO(dzhulgakov): for now it's implemented with incremental reducers because
+// of fused sparse support. But using "lengths" representation actually implies
+// continuous segments and thus range reducers can be used for non-sparse
+// version.
+template <
+ typename TData,
+ typename TLengths,
+ class Context,
+ class Reducer,
+ bool SparseFused = true>
+class AbstractLengthsOp : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
+
+ bool RunOnDevice() override {
+ // If more complicated fixed size logic becomes necessary, it can be moved
+ // to the reducer class
+ TIndex dataBlockSize = Input(0).size_from_dim(1);
+ return DispatchHelper<typename Reducer::FixedDispatch>::call(
+ this, dataBlockSize);
+ }
+
+ template <int FixedSize>
+ bool DoRunWithValue() {
+ auto& dataInput = Input(0);
+ auto& lengthsInput = Input(LENGTHS);
+ auto* output = Output(0);
+
+ CHECK_EQ(1, lengthsInput.ndim()) << "LENGTHS must be a vector";
+ const TIndex dataSize = dataInput.dim(0);
+    // Either the first dim of the data or how many entries we pull in via indices
+ TIndex dataToReduceSize;
+ const TIndex outputSize = lengthsInput.dim(0);
+
+ const TIndex* indicies;
+ if (SparseFused) { // static if
+ auto& indicesInput = Input(INDICES);
+ CHECK_EQ(1, indicesInput.ndim()) << "INDICES must be a vector";
+ indicies = indicesInput.template data<TIndex>();
+ dataToReduceSize = indicesInput.dim(0);
+ } else {
+ dataToReduceSize = dataSize;
+ }
+
+ typename Reducer::Meta ctx;
+ ctx.observeInput(0, dataInput, 1);
+ for (int i = 1; i < Reducer::kInputCount; ++i) {
+ auto& aux_in = Input(i);
+      CAFFE_ENFORCE(
+          dataToReduceSize == aux_in.dim(0),
+          "Input ",
+          i,
+          " must have the same first dim as the data being reduced");
+ ctx.observeInput(i, aux_in, 1);
+ }
+
+ const TLengths* lengths = lengthsInput.template data<TLengths>();
+ const TData* data = dataInput.template data<TData>();
+
+ vector<TIndex> shape{outputSize};
+ ctx.appendOutputShape(&shape);
+ output->Resize(shape);
+
+ TIndex in_block_size = dataInput.size_from_dim(1);
+ TIndex out_block_size = output->size_from_dim(1);
+ TData* out = output->template mutable_data<TData>();
+
+ TIndex dataIndex = 0;
+ for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
+ Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
+ for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
+ ++dataIndex) {
+ TIndex idx;
+ if (SparseFused) { // static if
+ idx = indicies[dataIndex];
+ CAFFE_ENFORCE(
+ 0 <= idx && idx < dataSize,
+ "Index ",
+ dataIndex,
+ " is out of bounds: ",
+ idx,
+ ", range 0 to ",
+ dataSize);
+ } else {
+ idx = dataIndex;
+ CAFFE_ENFORCE(
+ idx < dataSize,
+ "Range ",
+ rangeIndex,
+ " of length ",
+ lengths[rangeIndex],
+ " is out of bound ",
+ dataSize);
+ }
+ reducer.template process<FixedSize>(
+ ctx, data + in_block_size * idx, dataIndex, &context_);
+ }
+ }
+ CAFFE_ENFORCE(
+ dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
+ return true;
+ }
+
+ enum {
+ INDICES = Reducer::kInputCount,
+ LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
+ };
+ static constexpr int kSelfInputs = SparseFused ? 2 : 1;
+ static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
+};
+
+// Gradient actually doesn't depend on whether sparse lookup is fused or not
+template <typename T, typename TLengths, class Context, class ReducerGradient>
+class AbstractLengthsGradientOp : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
+
+ bool RunOnDevice() override {
+ // If more complicated fixed size logic becomes necessary, it can be moved
+ // to the reducer class
+ TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
+ return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
+ this, gradBlockSize);
+ }
+
+ template <int FixedSize>
+ bool DoRunWithValue() {
+ auto& segmentGradsInput = Input(SEGMENT_GRADS);
+ auto& lengthsInput = Input(LENGTHS);
+ auto* dataGradsOutput = Output(0);
+
+ CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
+ TIndex reducedDataSize = 0;
+ TIndex numSegments = lengthsInput.dim(0);
+ CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
+ CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
+ const TLengths* lengths = lengthsInput.template data<TLengths>();
+ for (TIndex i = 0; i < numSegments; ++i) {
+ reducedDataSize += lengths[i];
+ }
+
+ typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
+ for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
+ auto& aux_in = Input(i);
+      CAFFE_ENFORCE_EQ(
+          reducedDataSize,
+          aux_in.dim(0),
+          "Input ",
+          i,
+          " must have the same first dim as the sum of LENGTHS");
+ ctx.observeOriginalInput(ReducerGradient::originalInputs()[i], aux_in, 1);
+ }
+
+ const T* segmentGrads = segmentGradsInput.template data<T>();
+
+ vector<TIndex> shape;
+ shape.push_back(reducedDataSize);
+ ctx.appendGradShape(&shape);
+ dataGradsOutput->Resize(shape);
+
+ TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
+ TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
+ T* dataGrads = dataGradsOutput->template mutable_data<T>();
+
+ TIndex dataIndex = 0;
+ for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+ ReducerGradient reducer(
+ ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
+ for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
+ ++dataIndex) {
+ reducer.template fillGrad<FixedSize>(
+ ctx,
+ dataGrads + dataGradsBlockSize * dataIndex,
+ dataIndex,
+ &context_);
+ }
+ }
+ CAFFE_ENFORCE(
+ dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
+ return true;
+ }
+
+ // Input layout:
+  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS
+ // orig_argXs represent original op's inputs and will be passed to the reducer
+ // directly
+ static constexpr int kNumInputs =
+ ReducerGradient::originalInputs().size() + 2;
+ enum _InputTags {
+ SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
+ LENGTHS
+ };
+};
+
+// base implementation of sparse/non-sparse gradient computation
+template <
+ typename ForwardOp,
+ typename ReducerDef,
+ typename ReducerGradient,
+ bool SparseFused>
+struct LengthsOpGetGradient : public GradientMakerBase {
+ using GradientMakerBase::GradientMakerBase;
+ vector<OperatorDef> GetGradientDefs() override {
+ vector<string> grad_ins;
+ for (const int i : ReducerGradient::originalInputs()) {
+ grad_ins.push_back(I(i));
+ }
+ grad_ins.push_back(GO(0));
+ grad_ins.push_back(I(ForwardOp::LENGTHS));
+ vector<OperatorDef> r{CreateOperatorDef(
+ string("Lengths") + ReducerDef::name + "Gradient",
+ "",
+ grad_ins,
+ // no gradient on segment_ids or auxiliary inputs for now
+ vector<string>{SparseFused ? GI_V(0) : GI(0)})};
+ if (SparseFused) {
+ SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
+ }
+ return r;
+ }
+};
+
+template <typename T, typename SIndex, typename Context, typename ReducerDef>
+struct AbstractLengthsDef {
+ using OpDef = ReducerDef;
+ static constexpr const char* basename = "Lengths";
+ static constexpr const char* doc = R"DOC(
+Applies '{op}' to each segment of the input tensor. Segments are defined
+by their LENGTHS.
+
+LENGTHS is a vector that maps each of the first dimension slices of the
+DATA to a particular group (segment). Values belonging to the same segment are
+aggregated together.
+
+For example LENGTHS = [2, 1] stands for segments DATA[0..1] and DATA[2]
+
+The first dimension of the output is equal to the number of input segments,
+i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
+
+{op_doc}
+ )DOC";
+ static void PopulateSchema(OpSchema& schema) {
+ schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
+ schema.Input(
+ Reducer::kInputCount,
+ "LENGTHS",
+      "Vector whose elements sum to the first dimension of DATA");
+ schema.Output(
+ 0,
+ "OUTPUT",
+      "Aggregated output tensor. Has the first dimension of len(LENGTHS).");
+ ReducerDef::PopulateSchema(schema);
+ }
+ using Reducer = typename ReducerDef::template Reducer<T, Context>;
+ using ReducerGradient =
+ typename ReducerDef::template ReducerGradient<T, Context>;
+ using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
+ using BackwardOp =
+ AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
+ using GetGradient = LengthsOpGetGradient<
+ ForwardOp,
+ ReducerDef,
+ ReducerGradient,
+ false /*SparseFused*/>;
+};
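For readers skimming the doc above, here is a rough NumPy sketch of the Lengths<op> semantics with Sum as the reducer; the helper name is illustrative and not part of the operator.
```
import numpy as np

def lengths_sum_ref(data, lengths):
    # sum(LENGTHS) must match the first dimension of DATA
    assert sum(lengths) == data.shape[0]
    out = np.zeros((len(lengths),) + data.shape[1:], dtype=data.dtype)
    offset = 0
    for i, n in enumerate(lengths):
        out[i] = data[offset:offset + n].sum(axis=0)
        offset += n
    return out

data = np.array([[1., 2.], [3., 4.], [5., 6.]])
print(lengths_sum_ref(data, [2, 1]))  # [[4. 6.], [5. 6.]]
```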
+
+template <typename T, typename SIndex, typename Context, typename ReducerDef>
+struct AbstractSparseLengthsDef {
+ using OpDef = ReducerDef;
+ static constexpr const char* basename = "SparseLengths";
+ static constexpr const char* doc = R"DOC(
+Pulls in slices of the input tensor, groups them into segments and applies
+'{op}' to each segment. Segments are defined by their LENGTHS.
+
+This op is basically Gather and Lengths{op} fused together.
+
+INDICES should contain integers in range 0..N-1 where N is the first dimension
+of DATA. INDICES represent which slices of DATA need to be pulled in.
+
+LENGTHS is a vector that defines slice sizes by the first dimension of DATA. Values
+belonging to the same segment are aggregated together. sum(LENGTHS) has
+to match INDICES size.
+
+The first dimension of the output is equal to the number of input segments,
+i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
+
+{op_doc}
+ )DOC";
+ static void PopulateSchema(OpSchema& schema) {
+ schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
+ schema.Input(
+ Reducer::kInputCount,
+ "INDICES",
+ "Integer vector containing indices of the first dimension of DATA for "
+ "the slices that are being aggregated");
+ schema.Input(
+ Reducer::kInputCount + 1,
+ "LENGTHS",
+      "Non-negative vector with sum of elements equal to INDICES length");
+ schema.Output(
+ 0,
+ "OUTPUT",
+ "Aggregated output tensor. Has the first dimension of K "
+ "(the number of segments).");
+ ReducerDef::PopulateSchema(schema);
+ }
+ using Reducer = typename ReducerDef::template Reducer<T, Context>;
+ using ReducerGradient =
+ typename ReducerDef::template ReducerGradient<T, Context>;
+ using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
+ // TODO(dzhulgakov): we're registering the same class twice here,
+ // consider avoiding op duplication here
+ using BackwardOp =
+ AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
+ using GetGradient = LengthsOpGetGradient<
+ ForwardOp,
+ ReducerDef,
+ ReducerGradient,
+ true /*SparseFused*/>;
+};
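A similarly hedged NumPy sketch of the fused SparseLengths<op> described above (Sum reducer): Gather by INDICES, then reduce by LENGTHS. Names are illustrative.
```
import numpy as np

def sparse_lengths_sum_ref(data, indices, lengths):
    # sum(LENGTHS) must match the INDICES size
    assert sum(lengths) == len(indices)
    gathered = data[np.asarray(indices)]      # the Gather part
    out = np.zeros((len(lengths),) + data.shape[1:], dtype=data.dtype)
    offset = 0
    for i, n in enumerate(lengths):
        out[i] = gathered[offset:offset + n].sum(axis=0)
        offset += n
    return out
```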
+
namespace {
template <typename Def>
@@ -1135,17 +1505,21 @@
REGISTER_SEGMENT_DEF(
AbstractSortedSegmentRangeDef<float, int, CPUContext, MaxRangeReducerDef>);
-#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \
- REGISTER_SEGMENT_DEF( \
- AbstractReduceFrontDef<float, CPUContext, reducer_def>); \
- REGISTER_SEGMENT_DEF( \
- AbstractSortedSegmentDef<float, int, CPUContext, reducer_def>); \
- REGISTER_SEGMENT_DEF( \
- AbstractSparseSortedSegmentDef<float, int, CPUContext, reducer_def>); \
- REGISTER_SEGMENT_DEF( \
- AbstractUnsortedSegmentDef<float, int, CPUContext, reducer_def>); \
- REGISTER_SEGMENT_DEF( \
- AbstractSparseUnsortedSegmentDef<float, int, CPUContext, reducer_def>)
+#define REGISTER_REDUCER_WITH_ALL_OPS(reducer_def) \
+ REGISTER_SEGMENT_DEF( \
+ AbstractReduceFrontDef<float, CPUContext, reducer_def>); \
+ REGISTER_SEGMENT_DEF( \
+ AbstractSortedSegmentDef<float, int, CPUContext, reducer_def>); \
+ REGISTER_SEGMENT_DEF( \
+ AbstractSparseSortedSegmentDef<float, int, CPUContext, reducer_def>); \
+ REGISTER_SEGMENT_DEF( \
+ AbstractUnsortedSegmentDef<float, int, CPUContext, reducer_def>); \
+ REGISTER_SEGMENT_DEF( \
+ AbstractSparseUnsortedSegmentDef<float, int, CPUContext, reducer_def>) \
+ REGISTER_SEGMENT_DEF( \
+ AbstractLengthsDef<float, int, CPUContext, reducer_def>) \
+ REGISTER_SEGMENT_DEF( \
+ AbstractSparseLengthsDef<float, int, CPUContext, reducer_def>)
REGISTER_REDUCER_WITH_ALL_OPS(SumReducerDef);
REGISTER_REDUCER_WITH_ALL_OPS(WeightedSumReducerDef);
diff --git a/caffe2/operators/sequence_ops.cc b/caffe2/operators/sequence_ops.cc
index 9d6a394..d7b10e1 100644
--- a/caffe2/operators/sequence_ops.cc
+++ b/caffe2/operators/sequence_ops.cc
@@ -34,17 +34,17 @@
bool DoRunWithType() {
const auto& in = Input(0);
CHECK_GE(in.ndim(), 1);
- const auto outer_size = in.dims()[0];
+ const int32_t outer_size = in.dims()[0];
const auto block_size = std::accumulate(
in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
const auto pad_width = startPaddingWidth_ + endPaddingWidth_;
// if no lengths is provided, assume it is a single full-span entry
- const int64_t* lengths_ptr = &outer_size;
+ const int32_t* lengths_ptr = &outer_size;
int64_t lengths_size = 1;
if (InputSize() > 1) {
const auto& lengths = Input(1);
- lengths_ptr = lengths.data<int64_t>();
+ lengths_ptr = lengths.data<int32_t>();
lengths_size = lengths.size();
}
@@ -124,17 +124,17 @@
bool DoRunWithType() {
const auto& in = Input(0);
CHECK_GE(in.ndim(), 1);
- const auto outer_size = in.dims()[0];
+ const int32_t outer_size = in.dims()[0];
const auto block_size = std::accumulate(
in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
const auto pad_width = startPaddingWidth_ + endPaddingWidth_;
// if no lengths is provided, assume it is a single full-span entry
- const int64_t* lengths_ptr = &outer_size;
+ const int32_t* lengths_ptr = &outer_size;
int64_t lengths_size = 1;
if (InputSize() > 1) {
const auto& lengths = Input(1);
- lengths_ptr = lengths.data<int64_t>();
+ lengths_ptr = lengths.data<int32_t>();
lengths_size = lengths.size();
}
@@ -167,8 +167,8 @@
std::transform(
lengths_ptr,
lengths_ptr + lengths_size,
- lengths_out->mutable_data<int64_t>(),
- [pad_width](int64_t x) { return x - pad_width; });
+ lengths_out->mutable_data<int32_t>(),
+ [pad_width](int32_t x) { return x - pad_width; });
return true;
}
@@ -207,16 +207,16 @@
bool DoRunWithType() {
const auto& in = Input(0);
CHECK_GE(in.ndim(), 1);
- const auto outer_size = in.dims()[0];
+ const int32_t outer_size = in.dims()[0];
const auto block_size = std::accumulate(
in.dims().begin() + 1, in.dims().end(), 1, std::multiplies<TIndex>());
// if no lengths is provided, assume it is a single full-span entry
- const int64_t* lengths_ptr = &outer_size;
+ const int32_t* lengths_ptr = &outer_size;
int64_t lengths_size = 1;
if (InputSize() > 1) {
const auto& lengths = Input(1);
- lengths_ptr = lengths.data<int64_t>();
+ lengths_ptr = lengths.data<int32_t>();
lengths_size = lengths.size();
}
@@ -288,8 +288,8 @@
std::transform(
lengths_ptr,
lengths_ptr + lengths_size,
- lengths_out->mutable_data<int64_t>(),
- [pad_width](int64_t x) { return x + pad_width; });
+ lengths_out->mutable_data<int32_t>(),
+ [pad_width](int32_t x) { return x + pad_width; });
return true;
}
diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc
index 8e641b3..4ab1d2c 100644
--- a/caffe2/operators/sparse_to_dense_mask_op.cc
+++ b/caffe2/operators/sparse_to_dense_mask_op.cc
@@ -36,23 +36,6 @@
template <typename TInd>
bool DoRunWithType() {
- if (InputSize() < 4) {
- return DoRunWithTypeAndLength<TInd, int32_t>();
- } else {
- const TypeMeta& meta = Input(LENGTHS).meta();
- if (meta.Match<int32_t>()) {
- return DoRunWithTypeAndLength<TInd, int32_t>();
- } else if (meta.Match<int64_t>()) {
- return DoRunWithTypeAndLength<TInd, int64_t>();
- } else {
- CAFFE_THROW("Unsupported type of tensor: ", meta.name());
- return false;
- }
- }
- }
-
- template <typename TInd, typename TLen>
- bool DoRunWithTypeAndLength() {
auto& sparse_indices = Input(INDICES);
CAFFE_ENFORCE(sparse_indices.ndim() == 1);
auto& sparse_values = Input(VALUES);
@@ -73,14 +56,14 @@
int cols = featuresCount_;
int rows = 0;
- TLen default_length = sparse_indices.dim32(0);
- const TLen* lengths_vec = nullptr;
+ int32_t default_length = sparse_indices.dim32(0);
+ const int32_t* lengths_vec = nullptr;
auto* output = Output(0);
vector<TIndex> shape;
if (InputSize() == 4) {
auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE(lengths.ndim() == 1);
- lengths_vec = lengths.data<TLen>();
+ lengths_vec = lengths.data<int32_t>();
rows = lengths.dim32(0);
}
if (rows == 0) {
@@ -107,7 +90,7 @@
output_data + i * block_nbytes);
}
- TLen offset = 0;
+ int32_t offset = 0;
for (int r = 0; r < rows; r++) {
for (int c = 0; c < lengths_vec[r]; c++) {
int idx = getFeatureIdx(sparse_indices_vec[offset + c]);
@@ -163,8 +146,8 @@
the value of `default_value`. After running this op:
```
-output[indices[i], :] = values[i]
-output[j, :] = default_value # for j not in indices
+output[j, :] = values[i] # where mask[j] == indices[i]
+output[j, :] = default_value # when mask[j] doesn't appear in indices
```
If `lengths` is provided and not empty, an extra "batch" dimension is prepended
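As a hedged NumPy sketch of the single-example semantics described in the doc above (no `lengths` input); the mask handling and the helper name are illustrative only.
```
import numpy as np

def sparse_to_dense_mask_ref(indices, values, default_value, mask):
    values = np.asarray(values)
    out = np.full((len(mask),) + values.shape[1:], default_value,
                  dtype=values.dtype)
    position = {feature_id: row for row, feature_id in enumerate(mask)}
    for idx, val in zip(indices, values):
        if idx in position:                 # mask[j] == indices[i]
            out[position[idx]] = val
    return out

print(sparse_to_dense_mask_ref([5, 7], [1.0, 2.0], -1.0, mask=[7, 3, 5]))
# [ 2. -1.  1.]
```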
diff --git a/caffe2/operators/spatial_batch_norm_op.cc b/caffe2/operators/spatial_batch_norm_op.cc
index 47b81ca..2378b90 100644
--- a/caffe2/operators/spatial_batch_norm_op.cc
+++ b/caffe2/operators/spatial_batch_norm_op.cc
@@ -139,7 +139,89 @@
template <>
bool SpatialBNGradientOp<CPUContext>::RunOnDevice() {
- CAFFE_THROW("Spatial BN gradient on the CPU is not implemented yet.");
+ const auto& X = Input(INPUT);
+ const auto& dY = Input(OUTPUT_GRAD);
+ const auto& scale = Input(SCALE);
+
+ DCHECK_EQ(X.ndim(), 4);
+ const int N = X.dim32(0);
+ const int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
+ const int H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
+ const int W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
+ DCHECK_EQ(scale.ndim(), 1);
+ DCHECK_EQ(scale.dim32(0), C);
+
+ ConstEigenVectorArrayMap<float> scale_arr(scale.data<float>(), C);
+ ConstEigenVectorArrayMap<float> mean_arr(Input(SAVED_MEAN).data<float>(), C);
+ ConstEigenVectorArrayMap<float> inv_var_arr(
+ Input(SAVED_INV_VAR).data<float>(), C);
+
+ auto* dX = Output(INPUT_GRAD);
+ auto* dScale = Output(SCALE_GRAD);
+ auto* dBias = Output(BIAS_GRAD);
+ dX->ResizeLike(X);
+ dScale->ResizeLike(scale);
+ dBias->ResizeLike(scale);
+
+  // dBias = np.sum(dY, axis=0)
+  // dScale = np.sum((X - mean) * inv_var * dY, axis=0)
+  // dX = (1. / (N * H * W)) * scale * inv_var * ((N * H * W) * dY
+  //   - np.sum(dY, axis=0)
+  //   - (X - mean) * inv_var * inv_var * np.sum(dY * (X - mean), axis=0))
+
+ EigenVectorArrayMap<float> dBias_arr(dBias->mutable_data<float>(), C);
+ EigenVectorArrayMap<float> dScale_arr(dScale->mutable_data<float>(), C);
+
+ dBias_arr.setZero();
+ dScale_arr.setZero();
+
+ const auto scaleInvVarNHW = scale_arr * inv_var_arr / (N * H * W);
+
+ switch (order_) {
+ case StorageOrder::NCHW: {
+ ConstEigenArrayMap<float> X_arr(X.data<float>(), H * W, N * C);
+ ConstEigenArrayMap<float> dY_arr(dY.data<float>(), H * W, N * C);
+ EigenArrayMap<float> dX_arr(dX->mutable_data<float>(), H * W, N * C);
+ dX_arr.setZero();
+
+ for (int nc = 0; nc < N * C; ++nc) {
+ int c = nc % C;
+ dBias_arr(c) += dY_arr.col(nc).sum();
+ dScale_arr(c) +=
+ ((X_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * dY_arr.col(nc))
+ .sum();
+ }
+ for (int nc = 0; nc < N * C; ++nc) {
+ int c = nc % C;
+ dX_arr.col(nc) += scaleInvVarNHW(c) *
+ (dY_arr.col(nc) * N * H * W - dBias_arr(c) -
+ (X_arr.col(nc) - mean_arr[c]) * dScale_arr(c) * inv_var_arr(c));
+ }
+ break;
+ }
+ case StorageOrder::NHWC: {
+ ConstEigenArrayMap<float> X_arr(X.data<float>(), C, N * H * W);
+ ConstEigenArrayMap<float> dY_arr(dY.data<float>(), C, N * H * W);
+ EigenArrayMap<float> dX_arr(dX->mutable_data<float>(), C, N * H * W);
+ dX_arr.setZero();
+
+ const auto dYRowSum = dY_arr.rowwise().sum();
+ const auto XMinusMean = X_arr.colwise() - mean_arr;
+ const auto dYMulXMinusMeanRowSum = (dY_arr * XMinusMean).rowwise().sum();
+ const auto invVarSqr = inv_var_arr * inv_var_arr;
+ for (int nhw = 0; nhw < N * H * W; ++nhw) {
+ dBias_arr += dY_arr.col(nhw);
+ dScale_arr +=
+ (X_arr.col(nhw) - mean_arr) * inv_var_arr * dY_arr.col(nhw);
+ dX_arr.col(nhw) += scaleInvVarNHW *
+ (dY_arr.col(nhw) * N * H * W - dYRowSum -
+ XMinusMean.col(nhw) * invVarSqr * dYMulXMinusMeanRowSum);
+ }
+ break;
+ }
+ default:
+ CAFFE_THROW("Unknown storage order: ", order_);
+ }
+ return true;
}
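The NCHW branch above can be cross-checked against this NumPy sketch of the same formulas (SAVED_INV_VAR is treated as the saved inverse standard deviation; the function name is illustrative, not part of the operator).
```
import numpy as np

def spatial_bn_grad_ref(X, dY, scale, mean, inv_std):
    N, C, H, W = X.shape
    bcast = lambda v: v.reshape(1, C, 1, 1)
    Xc = X - bcast(mean)
    dBias = dY.sum(axis=(0, 2, 3))
    dScale = (Xc * bcast(inv_std) * dY).sum(axis=(0, 2, 3))
    m = N * H * W
    dX = bcast(scale * inv_std / m) * (
        m * dY - bcast(dBias) - Xc * bcast(inv_std) * bcast(dScale))
    return dX, dScale, dBias
```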
REGISTER_CPU_OPERATOR(SpatialBN, SpatialBNOp<CPUContext>);
@@ -223,7 +305,7 @@
bool is_test = false;
if (HasArgument(def_, "is_test")) {
const auto& arg = GetArgument(def_, "is_test");
- CHECK(arg.has_i());
+ CAFFE_ENFORCE(arg.has_i());
is_test = arg.i();
}
vector<string> grad_outputs{GI(0), GI(1), GI(2)};
@@ -235,8 +317,7 @@
// X, scale, dY, estimated_mean, estimated_variance
CHECK_EQ(def_.input_size(), 5);
CHECK_EQ(def_.output_size(), 1);
- grad_inputs = vector<string>{
- I(0), I(1), GO(0), I(3), I(4)};
+ grad_inputs = vector<string>{I(0), I(1), GO(0), I(3), I(4)};
} else {
CHECK_EQ(def_.input_size(), 5);
CHECK_EQ(def_.output_size(), 5);
@@ -247,4 +328,4 @@
}
};
REGISTER_GRADIENT(SpatialBN, GetSpatialBNGradient);
-} // namespace caffe2
+} // namespace caffe2
diff --git a/caffe2/operators/spatial_batch_norm_op.h b/caffe2/operators/spatial_batch_norm_op.h
index 824a8c3..e7435e2 100644
--- a/caffe2/operators/spatial_batch_norm_op.h
+++ b/caffe2/operators/spatial_batch_norm_op.h
@@ -19,7 +19,8 @@
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
// TODO(jiayq): update the input and output size checks.
- CHECK((is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
+ CAFFE_ENFORCE(
+ (is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
CHECK_GT(epsilon_, 0);
CHECK_GE(momentum_, 0);
CHECK_LE(momentum_, 1);
@@ -49,8 +50,8 @@
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
- CHECK(InputSize() == 5);
- CHECK_EQ(OutputSize(), 3);
+ CAFFE_ENFORCE(InputSize() == 5);
+ CAFFE_ENFORCE(OutputSize() == 3);
}
~SpatialBNGradientOp() {}
diff --git a/caffe2/operators/square_root_divide_op.cc b/caffe2/operators/square_root_divide_op.cc
new file mode 100644
index 0000000..375937b
--- /dev/null
+++ b/caffe2/operators/square_root_divide_op.cc
@@ -0,0 +1,45 @@
+#include "caffe2/operators/square_root_divide_op.h"
+
+namespace caffe2 {
+
+REGISTER_CPU_OPERATOR(
+ SquareRootDivide,
+ SquareRootDivideOp<int32_t, CPUContext>);
+OPERATOR_SCHEMA(SquareRootDivide)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .AllowInplace({{0, 0}})
+ .SetDoc(R"DOC(
+Given a DATA tensor with first dimension N and a SCALE vector of the same size
+N, produces an output tensor with the same dimensions as DATA, consisting of
+DATA slices. The i-th slice is divided elementwise by sqrt(SCALE[i]). If
+SCALE[i] == 0, the output slice is identical to the input one (no scaling).
+
+Example:
+
+  DATA = [
+ [1.0, 2.0],
+ [3.0, 4.0]
+ ]
+
+ SCALE = [4, 9]
+
+ OUTPUT = [
+    [0.5, 1.0],
+    [1.0, 1.3333]
+ ]
+
+)DOC");
+
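A minimal NumPy sketch of the semantics above (the helper name is made up); it reproduces the example values.
```
import numpy as np

def square_root_divide_ref(data, scale):
    out = np.array(data, dtype=np.float64)
    for i, s in enumerate(scale):
        if s != 0:
            out[i] = out[i] / np.sqrt(s)   # SCALE[i] == 0 leaves the slice as-is
    return out

print(square_root_divide_ref([[1.0, 2.0], [3.0, 4.0]], [4, 9]))
# [[0.5 1.0], [1.0 ~1.333]]
```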
+class GetSquareRootDivideGradient : public GradientMakerBase {
+ using GradientMakerBase::GradientMakerBase;
+ vector<OperatorDef> GetGradientDefs() override {
+ return SingleGradientDef(
+ "SquareRootDivide",
+ "",
+ vector<string>{GO(0), I(1)},
+ vector<string>{GI(0)});
+ }
+};
+REGISTER_GRADIENT(SquareRootDivide, GetSquareRootDivideGradient);
+} // namespace caffe2
diff --git a/caffe2/operators/square_root_divide_op.h b/caffe2/operators/square_root_divide_op.h
new file mode 100644
index 0000000..8d5d908
--- /dev/null
+++ b/caffe2/operators/square_root_divide_op.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+template <typename TScale, class Context>
+class SquareRootDivideOp final : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ USE_DISPATCH_HELPER;
+
+ SquareRootDivideOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws) {}
+
+ bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<float, double>>::call(this, Input(DATA));
+ }
+
+ private:
+ template <typename TData>
+ bool DoRunWithType() {
+ auto& data = Input(DATA);
+ auto& scale = Input(SCALE);
+ auto* Y = Output(0);
+ Y->ResizeLike(data);
+ size_t batchSize = data.dim(0);
+ size_t exampleSize = data.size_from_dim(1);
+ CAFFE_ENFORCE(batchSize == scale.dim(0), batchSize, " != ", scale.dim(0));
+ auto* scalePtr = scale.template data<TScale>();
+ auto* dataPtr = data.template data<TData>();
+ auto* yPtr = Y->template mutable_data<TData>();
+ for (int i = 0; i < batchSize; ++i) {
+ auto scale = scalePtr[i];
+ CAFFE_ENFORCE(scale >= 0, scale, " < 0");
+ auto multiplier = scale == 0 ? 1.0 : 1 / std::sqrt(scale);
+ math::Scale<TData, Context>(
+ exampleSize,
+ multiplier,
+ dataPtr + i * exampleSize,
+ yPtr + i * exampleSize,
+ &context_);
+ }
+ return true;
+ }
+
+ INPUT_TAGS(DATA, SCALE);
+};
+
+} // namespace caffe2
diff --git a/caffe2/operators/summarize_op.h b/caffe2/operators/summarize_op.h
index fe4d2a7..86d5e9c 100644
--- a/caffe2/operators/summarize_op.h
+++ b/caffe2/operators/summarize_op.h
@@ -24,9 +24,12 @@
log_file_.reset(new std::ofstream(
target_folder + "/" + def.input(0) + kSummaryzeOpExtension,
std::ofstream::out | std::ofstream::trunc));
- CHECK(log_file_->good())
- << "Failed to open summarize file for tensor " << def.input(0)
- << ". rdstate() = " << log_file_->rdstate();
+ CAFFE_ENFORCE(
+ log_file_->good(),
+ "Failed to open summarize file for tensor ",
+ def.input(0),
+ ". rdstate() = ",
+ log_file_->rdstate());
}
}
~SummarizeOp() { if (to_file_) log_file_->close(); }
diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h
index 7b7ca3f..65e7fae 100644
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@@ -50,13 +50,13 @@
// deserialize everything into the target prefetched blob.
reader.Read(&key_, &value_);
TensorProtos protos;
- CHECK(protos.ParseFromString(value_));
- CHECK_EQ(protos.protos_size(), OutputSize());
+ CAFFE_ENFORCE(protos.ParseFromString(value_));
+ CAFFE_ENFORCE(protos.protos_size() == OutputSize());
for (int i = 0; i < protos.protos_size(); ++i) {
if (protos.protos(i).has_device_detail()) {
protos.mutable_protos(i)->clear_device_detail();
}
- CHECK(deserializer.Deserialize(
+ CAFFE_ENFORCE(deserializer.Deserialize(
protos.protos(i),
prefetched_blobs_[i].template GetMutable<TensorCPU>()));
}
@@ -65,8 +65,8 @@
for (int item_id = 0; item_id < batch_size_; ++item_id) {
reader.Read(&key_, &value_);
TensorProtos protos;
- CHECK(protos.ParseFromString(value_));
- CHECK_EQ(protos.protos_size(), OutputSize());
+ CAFFE_ENFORCE(protos.ParseFromString(value_));
+ CAFFE_ENFORCE(protos.protos_size() == OutputSize());
if (!shape_inferred_) {
// First, set the shape of all the blobs.
for (int i = 0; i < protos.protos_size(); ++i) {
@@ -82,7 +82,7 @@
if (protos.protos(i).has_device_detail()) {
protos.mutable_protos(i)->clear_device_detail();
}
- CHECK(deserializer.Deserialize(protos.protos(i), &src));
+ CAFFE_ENFORCE(deserializer.Deserialize(protos.protos(i), &src));
DCHECK_EQ(src.size() * batch_size_, dst->size());
this->context_.template CopyItems<CPUContext, CPUContext>(
src.meta(),
diff --git a/caffe2/operators/text_file_reader.cc b/caffe2/operators/text_file_reader.cc
index e74ab06..6c3d038 100644
--- a/caffe2/operators/text_file_reader.cc
+++ b/caffe2/operators/text_file_reader.cc
@@ -1,6 +1,7 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
+#include "caffe2/operators/text_file_reader_utils.h"
#include "caffe2/utils/string_utils.h"
namespace caffe2 {
@@ -154,6 +155,8 @@
TIndex batchSize_;
};
+CAFFE_KNOWN_TYPE(std::unique_ptr<TextFileReaderInstance>);
+
REGISTER_CPU_OPERATOR(CreateTextFileReader, CreateTextFileReaderOp);
REGISTER_CPU_OPERATOR(TextFileReaderRead, TextFileReaderReadOp);
diff --git a/caffe2/operators/text_file_reader_utils.cc b/caffe2/operators/text_file_reader_utils.cc
new file mode 100644
index 0000000..0264f0e
--- /dev/null
+++ b/caffe2/operators/text_file_reader_utils.cc
@@ -0,0 +1,118 @@
+#include "caffe2/operators/text_file_reader_utils.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <cerrno>
+#include <cstring>
+#include <sstream>
+
+namespace caffe2 {
+
+Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
+ : escape_(escape) {
+ reset();
+ std::memset(delimTable_, 0, sizeof(delimTable_));
+ for (int i = 0; i < delims.size(); ++i) {
+ delimTable_[(unsigned char)delims.at(i)] = i + 1;
+ }
+}
+
+void Tokenizer::reset() {
+ toBeSkipped_ = 0;
+ startDelimId_ = 0;
+ leftover_.clear();
+}
+
+void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
+ tokenized.modifiedStrings_.clear();
+ tokenized.tokens_.clear();
+
+ char* currentStart = start;
+ std::string* copied = nullptr;
+ if (!leftover_.empty()) {
+ tokenized.modifiedStrings_.emplace_back(new std::string());
+ copied = tokenized.modifiedStrings_.back().get();
+ *copied = std::move(leftover_);
+ }
+
+ char* ch;
+ for (ch = start + toBeSkipped_; ch < end; ++ch) {
+ if (*ch == escape_) {
+ if (!copied) {
+ tokenized.modifiedStrings_.emplace_back(new std::string());
+ copied = tokenized.modifiedStrings_.back().get();
+ }
+ copied->append(currentStart, ch);
+ currentStart = ch + 1;
+ // skip next character, since it's escaped
+ ++ch;
+ continue;
+ }
+ int newDelimId = delimTable_[(unsigned char)*ch];
+ if (newDelimId > 0) {
+ // found delimiter
+ tokenized.tokens_.emplace_back();
+ auto& token = tokenized.tokens_.back();
+ token.startDelimId = startDelimId_;
+ if (copied) {
+ copied->append(currentStart, ch);
+ const char* c_str = copied->data();
+ token.start = c_str;
+ token.end = c_str + copied->size();
+ } else {
+ token.start = currentStart;
+ token.end = ch;
+ }
+ currentStart = ch + 1;
+ copied = nullptr;
+ startDelimId_ = newDelimId - 1;
+ }
+ }
+ tokenized.lastDelim_ = startDelimId_;
+
+ toBeSkipped_ = ch - end;
+ if (copied) {
+ copied->append(currentStart, end);
+ leftover_ = std::move(*copied);
+ } else {
+ leftover_.assign(currentStart, end);
+ }
+}
+
+FileReader::FileReader(const std::string& path, size_t bufferSize)
+ : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
+ fd_ = open(path.c_str(), O_RDONLY, 0777);
+ if (fd_ < 0) {
+ throw std::runtime_error(
+ "Error opening file for reading: " + std::string(std::strerror(errno)));
+ }
+}
+
+void FileReader::reset() {
+ if (lseek(fd_, 0, SEEK_SET) == -1) {
+ throw std::runtime_error(
+        "Error resetting file cursor: " + std::string(std::strerror(errno)));
+ }
+}
+
+FileReader::~FileReader() {
+ if (fd_ >= 0) {
+ close(fd_);
+ }
+}
+
+void FileReader::operator()(CharRange& range) {
+ char* buffer = buffer_.get();
+ auto numRead = read(fd_, buffer, bufferSize_);
+ if (numRead == -1) {
+ throw std::runtime_error(
+ "Error reading file: " + std::string(std::strerror(errno)));
+ }
+ if (numRead == 0) {
+ range.start = nullptr;
+ range.end = nullptr;
+ return;
+ }
+ range.start = buffer;
+ range.end = buffer + numRead;
+}
+}
diff --git a/caffe2/operators/text_file_reader_utils.h b/caffe2/operators/text_file_reader_utils.h
new file mode 100644
index 0000000..17e888a
--- /dev/null
+++ b/caffe2/operators/text_file_reader_utils.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace caffe2 {
+
+struct Token {
+ int startDelimId;
+ const char* start;
+ const char* end;
+};
+
+class TokenizedString {
+ // holder for strings that have been modified
+ std::vector<std::unique_ptr<std::string>> modifiedStrings_;
+ std::vector<Token> tokens_;
+ int lastDelim_;
+
+ public:
+ const std::vector<Token>& tokens() const {
+ return tokens_;
+ }
+  int lastDelim() const {
+ return lastDelim_;
+ }
+ friend class Tokenizer;
+};
+
+class Tokenizer {
+ private:
+ int startDelimId_;
+ // state of the tokenizer
+ std::string leftover_;
+  // how many characters of the next batch to skip, e.g. because an escape
+  // char was the last character of the previous batch.
+ int toBeSkipped_;
+ int delimTable_[256];
+ const char escape_;
+
+ public:
+ Tokenizer(const std::vector<char>& delimiters, char escape);
+ void reset();
+ void next(char* start, char* end, TokenizedString& tokenized);
+};
+
+struct CharRange {
+ char* start;
+ char* end;
+};
+
+struct StringProvider {
+ virtual void operator()(CharRange&) = 0;
+ virtual void reset() = 0;
+ virtual ~StringProvider() {}
+};
+
+class BufferedTokenizer {
+ public:
+ BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
+ : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
+
+ bool next(Token& token) {
+ CharRange range;
+ while (tokenIndex_ >= tokenized_.tokens().size()) {
+ range.start = nullptr;
+ while (range.start == nullptr && pass_ < numPasses_) {
+ (*provider_)(range);
+ if (range.start == nullptr) {
+ ++pass_;
+ if (pass_ < numPasses_) {
+ provider_->reset();
+ tokenizer_.reset();
+ }
+ }
+ }
+ if (range.start == nullptr) {
+ return false;
+ }
+ tokenizer_.next(range.start, range.end, tokenized_);
+ tokenIndex_ = 0;
+ }
+ token = tokenized_.tokens()[tokenIndex_++];
+ return true;
+  }
+
+ int endDelim() const {
+ if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
+ return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
+ }
+ return tokenized_.lastDelim();
+ }
+
+ private:
+ StringProvider* provider_;
+ Tokenizer tokenizer_;
+ TokenizedString tokenized_;
+ int tokenIndex_;
+ int numPasses_;
+ int pass_{0};
+};
+
+class FileReader : public StringProvider {
+ public:
+ explicit FileReader(const std::string& path, size_t bufferSize = 65536);
+ ~FileReader();
+ void operator()(CharRange& range) override;
+ void reset() override;
+
+ private:
+ const size_t bufferSize_;
+ int fd_;
+ std::unique_ptr<char[]> buffer_;
+};
+}
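A rough Python sketch of the single-buffer behaviour of the Tokenizer above: split on a set of delimiter characters while honouring one escape character; the streaming/leftover handling of the C++ class is simplified away and the helper name is illustrative.
```
def tokenize_once(buf, delims, escape='\\'):
    tokens, current = [], []
    i = 0
    while i < len(buf):
        ch = buf[i]
        if ch == escape and i + 1 < len(buf):
            current.append(buf[i + 1])   # escaped char is kept verbatim
            i += 2
            continue
        if ch in delims:
            tokens.append(''.join(current))
            current = []
        else:
            current.append(ch)
        i += 1
    return tokens, ''.join(current)      # second value mirrors leftover_

print(tokenize_once('label\x01text\nlab\\\x01el2\n', {'\x01', '\n'}))
# (['label', 'text', 'lab\x01el2'], '')
```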
diff --git a/caffe2/utils/string_utils_test.cc b/caffe2/operators/text_file_reader_utils_test.cc
similarity index 97%
rename from caffe2/utils/string_utils_test.cc
rename to caffe2/operators/text_file_reader_utils_test.cc
index ab3e247..c2a8799 100644
--- a/caffe2/utils/string_utils_test.cc
+++ b/caffe2/operators/text_file_reader_utils_test.cc
@@ -6,6 +6,7 @@
#include "caffe2/utils/math.h"
#include "gtest/gtest.h"
+#include "caffe2/operators/text_file_reader_utils.h"
#include "caffe2/utils/string_utils.h"
#include <cstdio>
@@ -13,7 +14,7 @@
namespace caffe2 {
-TEST(StringTest, TokenizeTest) {
+TEST(TextFileReaderUtilsTest, TokenizeTest) {
TokenizedString tokenized;
std::string ch =
"label\1text\xc3\xbf\nlabel2\\\nTest\1tex\\\\t2\n"
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 4f744d7..4e35918 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -6,6 +6,7 @@
REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
REGISTER_CPU_OPERATOR(Flatten, FlattenOp<CPUContext>);
+REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);
REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
REGISTER_CPU_OPERATOR(Sum, SumOp<float, CPUContext>);
@@ -16,8 +17,13 @@
ScatterWeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
// From whatever the current context, ensure the output is TensorCPU
-REGISTER_CPU_OPERATOR(EnsureCPUOutput,
- CopyOp<CPUContext, CPUContext, CPUContext>);
+REGISTER_CPU_OPERATOR(
+ EnsureCPUOutput,
+ CopyOp<CPUContext, CPUContext, CPUContext>);
+// From CPU, copy it to whatever the current context
+REGISTER_CPU_OPERATOR(
+ CopyFromCPUInput,
+ CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Shape, ShapeOp<CPUContext>);
REGISTER_CPU_OPERATOR(Reshape, ReshapeOp<float, CPUContext>);
@@ -25,6 +31,7 @@
REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
+REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
@@ -32,6 +39,9 @@
REGISTER_CPU_OPERATOR(Slice, SliceOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(Squeeze, SqueezeOp<CPUContext>);
REGISTER_CPU_OPERATOR(ExpandDims, ExpandDimsOp<CPUContext>);
+REGISTER_CPU_OPERATOR(
+ SegmentIdsToLengthWeights,
+ SegmentIdsToLengthWeightsOp<CPUContext>);
OPERATOR_SCHEMA(WallClockTime)
.NumInputs(0)
@@ -104,6 +114,18 @@
"with first dimension equal first dimension of input, and remaining "
"input dimensions flatenned into the inner dimension of the output.");
+OPERATOR_SCHEMA(FlattenToVec)
+ .NumInputs(1)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Flattens the input tensor into a 1D vector.
+)DOC")
+ .Input(0, "input", "A tensor of rank >= 1.")
+ .Output(
+ 0,
+ "output",
+ "A tensor of rank 1 with the contents of the input tensor");
+
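FlattenToVec simply reinterprets the input as a rank-1 tensor with the same number of elements; in NumPy terms:
```
import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)
print(x.reshape(-1))  # [0. 1. 2. 3. 4. 5.]
```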
OPERATOR_SCHEMA(Alias)
.NumInputs(1)
.NumOutputs(1)
@@ -257,6 +279,16 @@
.Input(0, "input", "The input CUDA or CPU tensor.")
.Output(0, "output", "TensorCPU that is a copy of the input.");
+OPERATOR_SCHEMA(CopyFromCPUInput)
+ .NumInputs(1)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Take a CPU input tensor and copy it to an output in the current
+Context (GPU or CPU). This may involve a cross-device MemCpy.
+)DOC")
+ .Input(0, "input", "The input CPU tensor.")
+ .Output(0, "output", "either a TensorCUDA or a TensorCPU");
+
OPERATOR_SCHEMA(Shape)
.NumInputs(1)
.NumOutputs(1)
@@ -312,6 +344,49 @@
.Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
.Output(0, "OUTPUT", "Tensor of rank q + (r - 1).");
+OPERATOR_SCHEMA(GatherRanges)
+ .NumInputs(2)
+ .NumOutputs(2)
+ .SetDoc(R"DOC(
+Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather
+corresponding ranges into a 1-D tensor OUTPUT.
+
+RANGES dimensions description:
+1: represents list of examples within a batch
+2: represents list of features
+3: two values which are the start and length of a range (to be applied on DATA)
+
+Another output LENGTHS represents each example length within OUTPUT
+
+Example:
+ DATA = [1, 2, 3, 4, 5, 6]
+ RANGES = [
+ [
+ [0, 1],
+ [2, 2],
+ ],
+ [
+ [4, 1],
+ [5, 1],
+ ]
+ ]
+ OUTPUT = [1, 3, 4, 5, 6]
+ LENGTHS = [3, 2]
+)DOC")
+ .Input(0, "DATA", "Tensor of rank 1.")
+ .Input(
+ 1,
+ "RANGES",
+ "Tensor of int32/int64 ranges, of dims (N, M, 2). "
+ "Where N is number of examples and M is a size of each example. "
+      "Last dimension represents a range in the format (start, length)")
+ .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
+ .Output(
+ 1,
+ "LENGTHS",
+ "1-D tensor of size N with lengths over gathered data"
+ " for each row in a batch. sum(LENGTHS) == OUTPUT.size()");
+
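The example above can be reproduced with this hedged NumPy sketch of GatherRanges (the helper name is illustrative).
```
import numpy as np

def gather_ranges_ref(data, ranges):
    output, lengths = [], []
    for example in ranges:                 # RANGES has shape (N, M, 2)
        total = 0
        for start, length in example:
            output.extend(data[start:start + length])
            total += length
        lengths.append(total)
    return np.array(output), np.array(lengths, dtype=np.int32)

data = np.array([1, 2, 3, 4, 5, 6])
ranges = np.array([[[0, 1], [2, 2]], [[4, 1], [5, 1]]])
print(gather_ranges_ref(data, ranges))  # ([1 3 4 5 6], [3 2])
```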
OPERATOR_SCHEMA(Unique)
.NumInputs(1)
.NumOutputs(1, 2)
@@ -367,6 +442,20 @@
.Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
.Output(0, "lengths", "1-D int64_t tensor of segment lengths");
+OPERATOR_SCHEMA(SegmentIdsToLengthWeights)
+ .NumInputs(1)
+ .NumOutputs(1)
+ .Arg("power", "n of 1/pow(length,n) for normalization")
+ .SetDoc(
+      R"DOC(Similar to SegmentIdsToLengths but outputs a vector of segment
+weights derived from the lengths, i.e. 1/pow(length, power)
+)DOC")
+ .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
+ .Output(
+ 0,
+ "a vector of weights",
+ "1-D float tensor of segment weights by length");
+
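A hedged NumPy sketch of SegmentIdsToLengthWeights: each element gets 1/pow(length of its segment, power); sorted segment ids are assumed, as the operator enforces. The helper name is illustrative.
```
import numpy as np

def segment_ids_to_length_weights_ref(segment_ids, power=0.5):
    _, lengths = np.unique(segment_ids, return_counts=True)
    return np.concatenate(
        [np.full(n, 1.0 / np.power(n, power)) for n in lengths])

print(segment_ids_to_length_weights_ref([0, 0, 1, 1, 1, 2]))
# [0.7071 0.7071 0.5774 0.5774 0.5774 1.    ]
```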
OPERATOR_SCHEMA(Slice)
.NumInputs(3)
.NumOutputs(1)
@@ -403,8 +492,7 @@
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Remove single-dimensional entries from the shape of a tensor.
-Takes an optional parameter `dims` with a list of dimension to squeeze.
-If `dims` is not provided, all singleton dimensions are squeezed.
+Takes a parameter `dims` with a list of dimensions to squeeze.
If the same blob is provided in input and output, the operation is copy-free.
This is the exact inverse operation of ExpandDims given the same `dims` arg.
)DOC")
@@ -522,8 +610,10 @@
SHOULD_NOT_DO_GRADIENT(Unique);
SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
+SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
// TODO(azzolini): Add support for slice gradient
SHOULD_NOT_DO_GRADIENT(Slice);
+SHOULD_NOT_DO_GRADIENT(GatherRanges);
} // namespace
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index b7ca5ba..3b4486f 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -54,9 +54,12 @@
log_file_.reset(new std::ofstream(
target_folder + "/" + def().input(0) + kPrintFileExtension,
std::ofstream::out | std::ofstream::trunc));
- CHECK(log_file_->good()) << "Failed to open PrintOp file for tensor "
- << def().input(0)
- << ". rdstate() = " << log_file_->rdstate();
+ CAFFE_ENFORCE(
+ log_file_->good(),
+ "Failed to open PrintOp file for tensor ",
+ def().input(0),
+ ". rdstate() = ",
+ log_file_->rdstate());
}
}
@@ -201,6 +204,27 @@
}
};
+template <class Context>
+class FlattenToVecOp : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ USE_SIMPLE_CTOR_DTOR(FlattenToVecOp);
+
+ bool RunOnDevice() override {
+ auto& input = Input(0);
+ auto* output = Output(0);
+ DCHECK_GT(input.size(), 0);
+ output->Resize(input.size());
+
+ context_.template CopyItems<Context, Context>(
+ input.meta(),
+ input.size(),
+ input.raw_data(),
+ output->raw_mutable_data(input.meta()));
+ return true;
+ }
+};
+
// Output gets the data of input(0), but reshapes it like input(1).
template <class Context>
class ResizeLikeOp : public Operator<Context> {
@@ -240,10 +264,16 @@
T* output_data = output->template mutable_data<T>();
// Dimension checking
for (int i = 1; i < InputSize(); ++i) {
- CHECK(output->dims() == Input(i).dims())
- << ProtoDebugString(def()) << "\n"
- << output->dims() << "\n"
- << "Input " << i << ": " << Input(i).dims();
+ if (output->dims() != Input(i).dims()) {
+ CAFFE_THROW(
+ "Check failed: output->dims() == Input(i).dims().",
+ "Description: Input #",
+ i,
+ ", input dimension:",
+ Input(i).dims(),
+ " should match output dimension: ",
+ output->dims());
+ }
}
// Add the first two - works if in-place or not.
@@ -532,14 +562,9 @@
USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp);
bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
- }
-
- template <typename Index>
- bool DoRunWithType() {
auto& input = Input(0);
auto* output = Output(0);
- auto* input_data = input.template data<Index>();
+ auto* input_data = input.template data<int32_t>();
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
auto total_length =
@@ -564,22 +589,17 @@
USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp);
bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
- }
-
- template <typename Index>
- bool DoRunWithType() {
auto& input = Input(0);
auto* output = Output(0);
- auto* input_data = input.template data<Index>();
+ auto* input_data = input.template data<int32_t>();
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
auto size = input.size();
output->Resize(size, 2);
- auto* output_data = output->template mutable_data<Index>();
+ auto* output_data = output->template mutable_data<int32_t>();
- Index offset = 0;
+ int32_t offset = 0;
for (int i = 0; i < size; ++i) {
auto len = input_data[i];
output_data[i * 2] = offset;
@@ -611,7 +631,7 @@
auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
output->Resize(num_segments);
- auto* output_data = output->template mutable_data<int64_t>();
+ auto* output_data = output->template mutable_data<int32_t>();
if (num_segments == 0) {
return true;
}
@@ -632,6 +652,83 @@
}
};
+template <class Context>
+class SegmentIdsToLengthWeightsOp : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ SegmentIdsToLengthWeightsOp(const OperatorDef& operator_def, Workspace* ws)
+ : Operator<Context>(operator_def, ws),
+ power_(OperatorBase::GetSingleArgument<float>("power", 0.5)) {}
+
+ bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
+ }
+
+ template <typename Index>
+ bool DoRunWithType() {
+ auto& input = Input(0);
+ CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
+ auto* input_data = input.template data<Index>();
+ auto input_size = input.size();
+ auto* output = Output(0);
+
+ // segment id starts from 0
+ auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
+ CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
+
+ std::vector<int64_t> seg_lengths(num_segments, 0);
+
+ output->Resize(input_size);
+ auto* output_data = output->template mutable_data<float>();
+ if (num_segments == 0) {
+ return true;
+ }
+ std::fill(output_data, output_data + num_segments, 0);
+
+ Index prev = input_data[0];
+ for (int64_t i = 0; i < input_size; i++) {
+ CAFFE_ENFORCE(
+ prev == input_data[i] || prev + 1 == input_data[i],
+ "Segment ids must be sorted and at least size 1: ",
+ prev,
+ " vs ",
+ input_data[i]);
+ prev = input_data[i];
+ seg_lengths[input_data[i]] += 1;
+ }
+
+ int64_t in = 0;
+
+ std::function<float(const int64_t& length, const float& power)> getWeight;
+
+ if (power_ == 0.5) {
+ getWeight = [](const int64_t& length, const float& power) {
+ return 1.0 / sqrt(length);
+ };
+ } else if (power_ == 1) {
+ getWeight = [](const int64_t& length, const float& power) {
+ return 1.0 / length;
+ };
+ } else {
+ getWeight = [](const int64_t& length, const float& power) {
+ return 1.0 / pow(length, power);
+ };
+ }
+
+ for (int64_t i = 0; i < num_segments; i++) {
+ float weight = getWeight(seg_lengths[i], power_);
+ for (int64_t j = 0; j < seg_lengths[i]; j++) {
+ output_data[in++] = weight;
+ }
+ }
+
+ return true;
+ }
+
+ private:
+ float power_;
+};
+
template <class SIndex, class Context>
class SliceOp : public Operator<Context> {
public:
@@ -848,12 +945,23 @@
if (unknown_idx != -1) {
CAFFE_ENFORCE(
total_size % size == 0,
- "Argument `shape` does not agree with the input data.");
+ "Argument `shape` does not agree with the input data.",
+ " (",
+ total_size,
+ " vs ",
+ size,
+ ")");
new_shape_[unknown_idx] = total_size / size;
} else {
- CAFFE_ENFORCE(
- total_size == size,
- "Argument `shape` does not agree with the input data.");
+ CAFFE_ENFORCE_EQ(
+ total_size,
+ size,
+ "Argument `shape` does not agree with the input data.",
+ " (",
+ total_size,
+ " != ",
+ size,
+ ")");
}
// Write the original shape to the second output.
@@ -887,16 +995,11 @@
USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp);
bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<int, long>>::call(this, Input(0));
- }
-
- template <typename T>
- bool DoRunWithType() {
auto& input = Input(0);
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
auto* output = Output(0);
- auto* input_data = input.template data<T>();
+ auto* input_data = input.template data<int32_t>();
auto size = input.size();
auto first = input_data[0];
@@ -907,7 +1010,7 @@
}
output->Resize(2);
- auto* output_data = output->template mutable_data<T>();
+ auto* output_data = output->template mutable_data<int32_t>();
output_data[0] = size;
output_data[1] = first;
@@ -923,13 +1026,14 @@
: Operator<Context>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
auto originalSize = dims_.size();
+ CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
+
std::sort(dims_.begin(), dims_.end());
std::unique(dims_.begin(), dims_.end());
if (dims_.size() < originalSize) {
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
}
- CHECK(dims_.empty() || dims_.front() >= 0)
- << "Dimension ids must be non-negative.";
+ CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
}
bool RunOnDevice() override {
@@ -947,13 +1051,11 @@
for (int i = 0; i < input.dims().size(); ++i) {
if (j < dims_.size() && dims_[j] == i) {
CAFFE_ENFORCE(
- input.dims()[i] == 1, "Dimension ", i, " of input must be 1.");
+ input.dims()[i] == 1, "Dimension ", i, " of input must be 1",
+ " instead of ", input.dims()[i], ".");
++j;
continue;
- } else if (dims_.empty() && input.dim(i) == 1) {
- continue;
}
-
newDims.push_back(input.dims().at(i));
}
output->Reshape(newDims);
@@ -975,13 +1077,13 @@
: Operator<Context>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
auto originalSize = dims_.size();
+ CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
std::sort(dims_.begin(), dims_.end());
std::unique(dims_.begin(), dims_.end());
if (dims_.size() < originalSize) {
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
}
- CHECK(dims_.empty() || dims_.front() >= 0)
- << "Dimension ids must be non-negative.";
+ CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
}
bool RunOnDevice() override {
@@ -1020,7 +1122,7 @@
template <typename Index>
bool DoRunWithType() {
- // If we endup using it on GPU doint O(N) memcpy is probably not best :)
+    // If we end up using it on GPU, doing O(N) memcpy is probably not best :)
// TODO: implement prefetching if it starts mattering (TF does it)
auto& data = Input(DATA);
auto& indices = Input(INDICES);
@@ -1053,6 +1155,86 @@
INPUT_TAGS(DATA, INDICES);
};
+template <class Context>
+class GatherRangesOp : public Operator<Context> {
+ public:
+ USE_OPERATOR_CONTEXT_FUNCTIONS;
+ USE_SIMPLE_CTOR_DTOR(GatherRangesOp);
+
+ bool RunOnDevice() override {
+ return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
+ this, OperatorBase::Input<TensorCPU>(RANGES));
+ }
+
+ template <typename Index>
+ bool DoRunWithType() {
+ auto& data = Input(DATA);
+ auto& ranges = Input(RANGES);
+ auto* outputData = Output(0);
+ auto* outputLengths = Output(1);
+
+ auto batchSize = ranges.dim(0);
+ CAFFE_ENFORCE(data.ndim() == 1, "Data has to be 1-D");
+ CAFFE_ENFORCE(ranges.ndim() == 3, "Ranges must be 3-D");
+ CAFFE_ENFORCE(batchSize > 0, "Batch of examples can't be empty");
+ CAFFE_ENFORCE(ranges.dim(1) > 0, "There has to be at least one range");
+    CAFFE_ENFORCE(
+        ranges.dim(2) == 2, "Ranges last dimension should be of size 2");
+
+ auto* rawData = static_cast<const char*>(data.raw_data());
+ auto* rangesData = ranges.template data<Index>();
+
+ outputLengths->Resize(batchSize);
+ auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>();
+ size_t start = 0;
+ size_t blockSize = ranges.size() / batchSize;
+ for (size_t i = 0; i < batchSize; ++i) {
+ auto end = start + blockSize;
+ outputLengthsPtr[i] = accumulate(rangesData, start, end);
+ start = end;
+ }
+
+ size_t outputSize = accumulate(rangesData, 0, ranges.size());
+ outputData->Resize(outputSize);
+
+ auto outputRawData =
+ static_cast<char*>(outputData->raw_mutable_data(data.meta()));
+ VLOG(1) << "Copying data";
+ size_t outputOffsetBytes = 0;
+ auto itemsize = data.meta().itemsize();
+ for (int i = 0; i < ranges.size(); i += 2) {
+ auto rangeStart = rangesData[i];
+ auto rangeLength = rangesData[i + 1];
+ if (!rangeLength) {
+ continue;
+ }
+ auto rangeSizeBytes = rangeLength * itemsize;
+ CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
+ CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
+      VLOG(2) << "Performing copy for range " << i;
+ context_.template CopyItems<Context, Context>(
+ data.meta(),
+ rangeLength,
+ rawData + rangeStart * itemsize,
+ outputRawData + outputOffsetBytes);
+ outputOffsetBytes += rangeSizeBytes;
+ }
+ CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize);
+ return true;
+ }
+
+ INPUT_TAGS(DATA, RANGES, LENGTHS);
+
+ private:
+ template <typename Index>
+ size_t accumulate(Index* ranges, size_t start, size_t end) {
+ size_t result = 0;
+ for (int i = start + 1; i < end; i += 2) {
+ result += ranges[i];
+ }
+ return result;
+ }
+};
+
// Since we just do copying, consider untemplating it on T and using raw_data()
/**
* Deduplicates input indices vector and optionally produces reverse remapping.
diff --git a/caffe2/operators/utility_ops_gpu.cc b/caffe2/operators/utility_ops_gpu.cc
index eed7ad3..270a1a3 100644
--- a/caffe2/operators/utility_ops_gpu.cc
+++ b/caffe2/operators/utility_ops_gpu.cc
@@ -6,23 +6,31 @@
REGISTER_CUDA_OPERATOR(Print, PrintOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Flatten, FlattenOp<CUDAContext>);
+REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Alias, AliasOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Sum, SumOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp<float, CUDAContext>);
// From whatever the current context, ensure the output is TensorCPU
-REGISTER_CUDA_OPERATOR(EnsureCPUOutput,
- CopyOp<CUDAContext, CPUContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+ EnsureCPUOutput,
+ CopyOp<CUDAContext, CPUContext, CUDAContext>);
+// From CPU, copy it to whatever the current context
+REGISTER_CUDA_OPERATOR(
+ CopyFromCPUInput,
+ CopyOp<CUDAContext, CUDAContext, CPUContext>);
+
// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context,
// since gpu code will be involved.
-REGISTER_CUDA_OPERATOR(CopyGPUToCPU,
- CopyOp<CUDAContext, CPUContext, CUDAContext>);
-REGISTER_CUDA_OPERATOR(CopyCPUToGPU,
- CopyOp<CUDAContext, CUDAContext, CPUContext>);
+REGISTER_CUDA_OPERATOR(
+ CopyGPUToCPU,
+ CopyOp<CUDAContext, CPUContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+ CopyCPUToGPU,
+ CopyOp<CUDAContext, CUDAContext, CPUContext>);
// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe
// involving different GPUs.
-REGISTER_CUDA_OPERATOR(Copy,
- CopyOp<CUDAContext, CUDAContext, CUDAContext>);
+REGISTER_CUDA_OPERATOR(Copy, CopyOp<CUDAContext, CUDAContext, CUDAContext>);
} // namespace
} // namespace caffe2
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index 7a04ba2..6cf3fc8 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -1,10 +1,11 @@
from caffe2.python import core
+from caffe2.python.model_helper import ModelHelperBase
from caffe2.proto import caffe2_pb2
import logging
-class CNNModelHelper(object):
+class CNNModelHelper(ModelHelperBase):
"""A helper model so we can write CNN models more easily, without having to
manually define parameter initializations and operators separately.
"""
@@ -12,40 +13,20 @@
def __init__(self, order="NCHW", name=None,
use_cudnn=True, cudnn_exhaustive_search=False,
ws_nbytes_limit=None, init_params=True):
- if name is None:
- name = "CNN"
- self.net = core.Net(name)
- self.param_init_net = core.Net(name + '_init')
- self.params = []
- self.param_to_grad = {}
+ super(CNNModelHelper, self).__init__(
+ name="CNN" if name is None else name, init_params=init_params)
+
self.weights = []
self.biases = []
self.order = order
self.use_cudnn = use_cudnn
self.cudnn_exhaustive_search = cudnn_exhaustive_search
self.ws_nbytes_limit = ws_nbytes_limit
- self.init_params = init_params
- self.gradient_ops_added = False
if self.order != "NHWC" and self.order != "NCHW":
raise ValueError(
"Cannot understand the CNN storage order %s." % self.order
)
- def Proto(self):
- return self.net.Proto()
-
- def InitProto(self):
- return self.param_init_net.Proto()
-
- def RunAllOnGPU(self, *args, **kwargs):
- self.param_init_net.RunAllOnGPU(*args, **kwargs)
- self.net.RunAllOnGPU(*args, **kwargs)
-
- def CreateDB(self, blob_out, db, db_type, **kwargs):
- dbreader = self.param_init_net.CreateDB(
- [], blob_out, db=db, db_type=db_type, **kwargs)
- return dbreader
-
def ImageInput(
self, blob_in, blob_out, **kwargs
):
@@ -59,17 +40,6 @@
blob_in, blob_out, **kwargs)
return data, label
- def TensorProtosDBInput(
- self, unused_blob_in, blob_out, batch_size, db, db_type, **kwargs
- ):
- """TensorProtosDBInput."""
- dbreader_name = "dbreader_" + db
- dbreader = self.param_init_net.CreateDB(
- [], dbreader_name,
- db=db, db_type=db_type)
- return self.net.TensorProtosDBInput(
- dbreader, blob_out, batch_size=batch_size)
-
def Conv(
self, blob_in, blob_out, dim_in, dim_out, kernel, weight_init=None,
bias_init=None, **kwargs
@@ -237,8 +207,8 @@
)
return concat
- def FC(
- self, blob_in, blob_out, dim_in, dim_out, weight_init=None,
+ def _FC_or_packed_FC(
+ self, op_call, blob_in, blob_out, dim_in, dim_out, weight_init=None,
bias_init=None, **kwargs
):
"""FC"""
@@ -264,7 +234,15 @@
bias = core.ScopedBlobReference(
blob_out + '_b', self.param_init_net)
self.params.extend([weight, bias])
- return self.net.FC([blob_in, weight, bias], blob_out, **kwargs)
+ self.weights.append(weight)
+ self.biases.append(bias)
+ return op_call([blob_in, weight, bias], blob_out, **kwargs)
+
+ def FC(self, *args, **kwargs):
+ return self._FC_or_packed_FC(self.net.FC, *args, **kwargs)
+
+ def PackedFC(self, *args, **kwargs):
+ return self._FC_or_packed_FC(self.net.PackedFC, *args, **kwargs)
def FC_Decomp(
self, blob_in, blob_out, dim_in, dim_out,
@@ -431,7 +409,7 @@
"""Depth Concat."""
return self.net.Concat(
blobs_in,
- [blob_out, "_" + blob_out + "_condat_dims"],
+ [blob_out, "_" + blob_out + "_concat_dims"],
order=self.order,
**kwargs
)[0]
@@ -451,6 +429,10 @@
"""Transpose."""
return self.net.Transpose(blob_in, blob_out, **kwargs)
+ def Sum(self, blob_in, blob_out, **kwargs):
+ """Sum"""
+ return self.net.Sum(blob_in, blob_out, **kwargs)
+
def SpatialBN(self, blob_in, blob_out, dim_in, **kwargs):
blob_out = blob_out or self.net.NextName()
# Input: input, scale, bias, est_mean, est_inv_var
@@ -465,13 +447,15 @@
return self.param_init_net.ConstantFill(
[], blob_out + "_" + suffix, shape=[dim_in], value=value)
scale, bias = init_blob(1.0, "s"), init_blob(0.0, "b")
+ running_mean = init_blob(0.0, "rm")
+ running_inv_var = init_blob(1.0, "riv")
self.params.extend([scale, bias])
self.weights.append(scale)
self.biases.append(bias)
- blob_outs = [blob_out, blob_out + "_rm", blob_out + "_riv",
+ blob_outs = [blob_out, running_mean, running_inv_var,
blob_out + "_sm", blob_out + "_siv"]
blob_outputs = self.net.SpatialBN(
- [blob_in, scale, bias, blob_outs[1], blob_outs[2]], blob_outs,
+ [blob_in, scale, bias, running_mean, running_inv_var], blob_outs,
order=self.order, **kwargs)
# Return the output
return blob_outputs[0]
@@ -500,15 +484,22 @@
def ZeroInit(self):
return ('ConstantFill', {})
- def AddGradientOperators(self, *args, **kwargs):
- if self.gradient_ops_added:
- raise RuntimeError("You cannot run AddGradientOperators twice.")
- self.gradient_ops_added = True
- grad_map = self.net.AddGradientOperators(*args, **kwargs)
- for p in self.params:
- if str(p) in grad_map:
- self.param_to_grad[p] = grad_map[str(p)]
- return grad_map
+ def AddWeightDecay(self, weight_decay):
+ """Adds a decay to weights in the model.
+
+ This is a form of L2 regularization.
+
+ Args:
+ weight_decay: strength of the regularization
+ """
+ if weight_decay <= 0.0:
+ return
+ wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
+ value=weight_decay)
+ ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+ for param in self.weights:
+ # Equivalent to: grad += wd * param
+ self.net.WeightedSum([self.param_to_grad[param], ONE, param, wd])
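In NumPy terms, the WeightedSum call above performs, for every weight blob, the in-place update sketched below (names are illustrative).
```
import numpy as np

def add_weight_decay_ref(grad, param, weight_decay):
    # grad = 1 * grad + weight_decay * param
    return grad + weight_decay * param

g = np.array([0.1, -0.2])
w = np.array([1.0, 2.0])
print(add_weight_decay_ref(g, w, 1e-4))  # [ 0.1001 -0.1998]
```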
@property
def CPU(self):
@@ -635,32 +626,3 @@
self.weights += step_net.weights
self.biases += step_net.biases
return output, hidden_state, cell_state
-
- def __getattr__(self, op_type):
- """Catch-all for all other operators, mostly those without params."""
- if not core.IsOperator(op_type):
- raise RuntimeError(
- 'Method ' + op_type + ' is not a registered operator.'
- )
- # known_working_ops are operators that do not need special care.
- known_working_ops = [
- "Accuracy",
- "Adam",
- "AveragedLoss",
- "Cast",
- "LabelCrossEntropy",
- "LearningRate",
- "Print",
- "Sigmoid",
- "Scale",
- "Snapshot",
- "Softmax",
- "StopGradient",
- "Summarize",
- "Tanh",
- "WeightedSum",
- ]
- if op_type not in known_working_ops:
- logging.warning("You are creating an op that the CNNModelHelper "
- "does not recognize: {}.".format(op_type))
- return self.net.__getattr__(op_type)
diff --git a/caffe2/python/control.py b/caffe2/python/control.py
new file mode 100644
index 0000000..9514c3a
--- /dev/null
+++ b/caffe2/python/control.py
@@ -0,0 +1,400 @@
+"""
+Implement functions for controlling execution of nets and steps, including
+ Do
+ DoParallel
+ For-loop
+ While-loop
+ Do-While-loop
+ Switch
+ If
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+
+
+def GetConditionBlobFromNet(condition_net):
+ """
+ The condition blob is the last external_output that must
+ be a single bool
+ """
+ assert len(condition_net.Proto().external_output) > 0, (
+        "Condition net %s must have at least one external output" %
+        condition_net.Proto().name)
+ # we need to use a blob reference here instead of a string
+ # otherwise, it will add another name_scope to the input later
+ # when we create new ops (such as OR of two inputs)
+ return core.BlobReference(condition_net.Proto().external_output[-1])
+
+def NotNet(condition_blob_or_net):
+ """Not of a condition blob or net
+
+ Args:
+ condition_blob_or_net can be either blob or net. If condition_blob_or_net
+ is Net, the condition is its last external_output
+ that must be a single bool.
+
+    Returns:
+        not_net: the net that computes NOT of the input condition
+        out_blob: the output blob of not_net
+ """
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ else:
+ condition_blob = condition_blob_or_net
+
+ not_net = core.Net('not_net')
+ out_blob = not_net.Not(condition_blob)
+ not_net.AddExternalOutput(out_blob)
+
+ return not_net, out_blob
+
+
+def _CopyConditionBlobNet(condition_blob):
+ """Make a condition net that copies the condition_blob
+
+ Args:
+ condition_blob is a single bool.
+
+    Returns:
+        condition_net: a net that copies condition_blob
+        out_blob: the output blob of condition_net
+ """
+ condition_net = core.Net('copy_condition_blob_net')
+ out_blob = condition_net.Copy(condition_blob)
+ condition_net.AddExternalOutput(out_blob)
+
+ return condition_net, out_blob
+
+
+def MergeConditionNets(name, condition_nets, relation):
+ """
+    Merge multiple condition nets into a single condition net.
+
+ Args:
+ name: name of the new condition net.
+ condition_nets: a list of condition nets. The last external_output
+ of each condition net must be single bool value.
+ relation: can be 'And' or 'Or'.
+
+ Returns:
+ - A new condition net. Its last external output is relation of all
+ condition_nets.
+ """
+ if not isinstance(condition_nets, list):
+ return condition_nets
+ if len(condition_nets) <= 1:
+ return condition_nets[0] if condition_nets else None
+
+ merged_net = core.Net(name)
+ for i in range(len(condition_nets)):
+ net_proto = condition_nets[i].Proto()
+ assert net_proto.device_option == merged_net.Proto().device_option
+ assert net_proto.type == merged_net.Proto().type
+ merged_net.Proto().op.extend(net_proto.op)
+ merged_net.Proto().external_input.extend(net_proto.external_input)
+ # discard external outputs as we're combining them together
+ curr_cond = GetConditionBlobFromNet(condition_nets[i])
+ if i == 0:
+ last_cond = curr_cond
+ else:
+ last_cond = merged_net.__getattr__(relation)([last_cond, curr_cond])
+
+ merged_net.AddExternalOutput(last_cond)
+
+ return merged_net
+
+
+def Do(*nets_or_steps):
+ """
+ Execute the sequence of nets or steps once.
+
+ Examples:
+ - Do(net1, net2, ..., net_n)
+ - Do(list_of_nets)
+ - Do(step1, step2, ..., step_n)
+ - Do(list_of_steps)
+ """
+ if len(nets_or_steps) == 0:
+ raise ValueError(
+ 'nets_or_steps cannot be empty.')
+ elif len(nets_or_steps) == 1:
+ nets_or_steps = nets_or_steps[0]
+ else:
+ nets_or_steps = list(nets_or_steps)
+
+ return core.execution_step('Do', nets_or_steps)
+
+
+def DoParallel(*nets_or_steps):
+ """
+ Execute the nets or steps in parallel, waiting for all of them to finish
+
+ Examples:
+ - DoParallel(net1, net2, ..., net_n)
+ - DoParallel(list_of_nets)
+ - DoParallel(step1, step2, ..., step_n)
+ - DoParallel(list_of_steps)
+ """
+ if len(nets_or_steps) == 0:
+ raise ValueError(
+ 'nets_or_steps cannot be empty.')
+ elif len(nets_or_steps) == 1:
+ nets_or_steps = nets_or_steps[0]
+ else:
+ nets_or_steps = list(nets_or_steps)
+
+ return core.execution_step(
+ 'DoParallel', nets_or_steps, concurrent_substeps=True)
+
+
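A hypothetical usage sketch for Do() and DoParallel(); the nets and blobs are illustrative, and building/running a full Plan is omitted.
```
from caffe2.python import core, control

init_net = core.Net('example_init')
init_net.ConstantFill([], 'x', shape=[1], value=0.0)
work_net = core.Net('example_work')
work_net.ConstantFill([], 'y', shape=[1], value=1.0)

step_seq = control.Do(init_net, work_net)          # run the two nets once, in order
step_par = control.DoParallel(init_net, work_net)  # run them concurrently
```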
+def _StopNet(stop_blob):
+ stop_net = core.Net('stop_net')
+ stop_net.ConstantFill(
+ [], [stop_blob], shape=[], value=True, dtype=core.DataType.BOOL)
+ return stop_net
+
+
+def _ToExecutionStep(net_or_step):
+ if isinstance(net_or_step, core.Net):
+ return Do(net_or_step)
+ elif isinstance(net_or_step, core.ExecutionStep):
+ return net_or_step
+ else:
+ raise ValueError(
+ 'net_or_step must be a net or a step.')
+
+
+def _RunOnceIf(condition_blob_or_net, net_or_step):
+ """
+    Execute net_or_step once if condition_blob_or_net evaluates to true.
+
+    If condition_blob_or_net is a Net, the condition is its last
+    external_output, which must be a single bool. This net is executed
+    before net_or_step in order to compute the condition.
+ """
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ return Do(Do(condition_blob_or_net),
+ _RunOnceIf(condition_blob, net_or_step))
+
+ stop_if_not_net, stop_blob = NotNet(condition_blob_or_net)
+ stop_net = _StopNet(stop_blob)
+
+ return core.execution_step(
+ '_RunOnceIf',
+ [Do(stop_if_not_net), _ToExecutionStep(net_or_step), Do(stop_net)],
+ should_stop_blob=stop_blob)
+
+
+def _RunOnceIfNot(condition_blob_or_net, net_or_step):
+ """
+    Similar to _RunOnceIf(), but executes net_or_step once if
+    condition_blob_or_net evaluates to false.
+ """
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ return Do(Do(condition_blob_or_net),
+ _RunOnceIfNot(condition_blob, net_or_step))
+
+ stop_if_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
+ stop_net = _StopNet(stop_blob)
+
+ return core.execution_step(
+ '_RunOnceIfNot',
+ [Do(stop_if_net), _ToExecutionStep(net_or_step), Do(stop_net)],
+ should_stop_blob=stop_blob)
+
+
+def For(net_or_step, iter_num):
+ """
+ Execute net_or_step iter_num times.
+
+ Args:
+    net_or_step: an instance of ExecutionStep or Net.
+    iter_num:    the number of times to execute the net_or_step.
+
+    Returns:
+    An ExecutionStep instance.
+ """
+ init_net = core.Net('init-net')
+ iter_cnt = init_net.CreateCounter([], init_count=iter_num)
+ iter_net = core.Net('For-iter')
+ iter_done = iter_net.CountDown([iter_cnt])
+
+ if isinstance(net_or_step, core.Net):
+ for_step = core.execution_step(
+ 'For', [iter_net, net_or_step], should_stop_blob=iter_done)
+ elif isinstance(net_or_step, core.ExecutionStep):
+ for_step = core.execution_step(
+ 'For', [Do(iter_net), net_or_step], should_stop_blob=iter_done)
+ else:
+ raise ValueError(
+ 'net_or_step must be a net or a step.')
+
+ return Do(Do(init_net), for_step)
+
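+# Example of For (an illustrative sketch mirroring control_test.py;
+# `train_net` is any core.Net to be run repeatedly):
+#
+#   step = For(train_net, 100)    # run train_net 100 times
+#   plan = core.Plan('for_example')
+#   plan.AddStep(step)
+#   workspace.RunPlan(plan)       # assumes `workspace` is imported by the caller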
+
+def While(condition_blob_or_net, net_or_step):
+ """
+    Execute net_or_step while condition_blob_or_net returns true.
+
+ Args:
+ condition_blob_or_net: If it is an instance of Net, its last
+ external_output must be a single bool.
+    net_or_step: an instance of ExecutionStep or Net.
+
+    Returns:
+    An ExecutionStep instance.
+ """
+ condition_not_net, stop_blob = NotNet(condition_blob_or_net)
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_step = Do(condition_blob_or_net, condition_not_net)
+ else:
+ condition_step = Do(condition_not_net)
+
+ return core.execution_step(
+ 'While',
+ [condition_step, _ToExecutionStep(net_or_step)],
+ should_stop_blob=stop_blob)
+
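+# Example of While (illustrative; `cond_net` must expose a single bool as its
+# last external_output and is re-run before every iteration, `body_net` is
+# any core.Net or ExecutionStep):
+#
+#   step = While(cond_net, body_net)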
+
+def Until(condition_blob_or_net, net_or_step):
+ """
+    Similar to While(), but executes net_or_step while
+    condition_blob_or_net returns false.
+ """
+ if isinstance(condition_blob_or_net, core.Net):
+ stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ condition_step = Do(condition_blob_or_net)
+ else:
+ copy_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
+ condition_step = Do(copy_net)
+
+ return core.execution_step(
+ 'Until',
+ [condition_step, _ToExecutionStep(net_or_step)],
+ should_stop_blob=stop_blob)
+
+
+def DoWhile(condition_blob_or_net, net_or_step):
+ """
+    Execute net_or_step while condition_blob_or_net returns true;
+    net_or_step is always executed at least once.
+
+    Args:
+    condition_blob_or_net: if it is an instance of Net, its last
+        external_output must be a single bool.
+    net_or_step: an instance of ExecutionStep or Net.
+
+    Returns:
+    An ExecutionStep instance.
+ """
+ condition_not_net, stop_blob = NotNet(condition_blob_or_net)
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_step = Do(condition_blob_or_net, condition_not_net)
+ else:
+ condition_step = Do(condition_not_net)
+
+ return core.execution_step(
+ 'DoWhile',
+ [_ToExecutionStep(net_or_step), condition_step],
+ should_stop_blob=stop_blob)
+
+
+def DoUntil(condition_blob_or_net, net_or_step):
+ """
+    Similar to DoWhile(), but executes net_or_step while
+    condition_blob_or_net returns false.
+ """
+ steps = [_ToExecutionStep(net_or_step)]
+
+ if isinstance(condition_blob_or_net, core.Net):
+ steps.append(Do(condition_blob_or_net))
+ stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ else:
+ stop_blob = condition_blob_or_net
+
+ stop_blob = core.BlobReference(str(stop_blob))
+ return core.execution_step('DoUntil', steps, should_stop_blob=stop_blob)
+
+
+def Switch(*conditions):
+ """
+ Execute the steps for which the condition is true.
+ Each condition is a tuple (condition_blob_or_net, step).
+ Note:
+    1. Multiple steps may be executed if their conditions are true.
+    2. The condition_blob_or_net (if it is a Net) of every step will be
+        executed once.
+
+ Examples:
+ - Switch((cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
+ - Switch([(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
+ - Switch((cond_1, net_1))
+ """
+ if len(conditions) == 0:
+ raise ValueError(
+ 'conditions cannot be empty.')
+ elif len(conditions) == 1:
+ conditions = conditions[0]
+ if not isinstance(conditions, list):
+ conditions = [conditions]
+ else:
+ conditions = list(conditions)
+
+ return core.execution_step(
+ 'Switch', [_RunOnceIf(cond, step) for cond, step in conditions])
+
+
+def If(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
+ """
+ condition_blob_or_net is first evaluated or executed. If the condition is
+ true, true_net_or_step is then executed, otherwise, false_net_or_step
+ is executed.
+
+    If condition_blob_or_net is a Net, the condition is its last
+    external_output, which must be a single bool. This Net is executed
+    before both true/false_net_or_step in order to compute the condition.
+ """
+ if not false_net_or_step:
+ return _RunOnceIf(condition_blob_or_net, true_net_or_step)
+
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ return Do(Do(condition_blob_or_net),
+ If(condition_blob, true_net_or_step, false_net_or_step))
+
+ condition_blob = condition_blob_or_net
+ not_net, _ = NotNet(condition_blob)
+
+ return Switch(
+ (condition_blob, true_net_or_step),
+ (not_net, false_net_or_step),
+ )
+
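+# Example of If (illustrative; cond_net, then_step and else_step are
+# hypothetical -- either nets or ExecutionSteps work for both branches):
+#
+#   step = If(cond_net, then_step, else_step)
+#   # cond_net runs first; then_step runs if its condition is true,
+#   # otherwise else_step runs.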
+
+def IfNot(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
+ """
+    If condition_blob_or_net returns false, executes true_net_or_step;
+    otherwise executes false_net_or_step.
+ """
+ if not false_net_or_step:
+ return _RunOnceIfNot(condition_blob_or_net, true_net_or_step)
+
+ if isinstance(condition_blob_or_net, core.Net):
+ condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
+ return Do(Do(condition_blob_or_net),
+ IfNot(condition_blob, true_net_or_step, false_net_or_step))
+
+ condition_blob = condition_blob_or_net
+ not_net, _ = NotNet(condition_blob)
+
+ return Switch(
+ (condition_blob, false_net_or_step),
+ (not_net, true_net_or_step),
+ )
diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py
new file mode 100644
index 0000000..066f7a6
--- /dev/null
+++ b/caffe2/python/control_test.py
@@ -0,0 +1,217 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import control, core, test_util, workspace
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+class TestControl(test_util.TestCase):
+ def setUp(self):
+ super(TestControl, self).setUp()
+ self.N_ = 10
+
+ self.init_net_ = core.Net("init-net")
+ cnt = self.init_net_.CreateCounter([], init_count=0)
+ const_n = self.init_net_.ConstantFill(
+ [], shape=[], value=self.N_, dtype=core.DataType.INT64)
+ const_0 = self.init_net_.ConstantFill(
+ [], shape=[], value=0, dtype=core.DataType.INT64)
+
+ self.cnt_net_ = core.Net("cnt-net")
+ self.cnt_net_.CountUp([cnt])
+ curr_cnt = self.cnt_net_.RetrieveCount([cnt])
+ self.init_net_.ConstantFill(
+ [], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64)
+ self.cnt_net_.AddExternalOutput(curr_cnt)
+
+ self.cond_net_ = core.Net("cond-net")
+ cond_blob = self.cond_net_.LT([curr_cnt, const_n])
+ self.cond_net_.AddExternalOutput(cond_blob)
+
+ self.not_cond_net_ = core.Net("not-cond-net")
+ cond_blob = self.not_cond_net_.GE([curr_cnt, const_n])
+ self.not_cond_net_.AddExternalOutput(cond_blob)
+
+ self.true_cond_net_ = core.Net("true-cond-net")
+ true_blob = self.true_cond_net_.LT([const_0, const_n])
+ self.true_cond_net_.AddExternalOutput(true_blob)
+
+ self.false_cond_net_ = core.Net("false-cond-net")
+ false_blob = self.false_cond_net_.GT([const_0, const_n])
+ self.false_cond_net_.AddExternalOutput(false_blob)
+
+ def CheckNetOutput(self, nets_and_expects):
+ """
+        Check that the output of each net matches the expected value.
+        nets_and_expects is a list of (net, expect) tuples.
+ """
+ for net, expect in nets_and_expects:
+ output = workspace.FetchBlob(
+ net.Proto().external_output[-1])
+ self.assertEqual(output, expect)
+
+ def BuildAndRunPlan(self, step):
+ plan = core.Plan("test")
+ plan.AddStep(control.Do(self.init_net_))
+ plan.AddStep(step)
+ self.assertEqual(workspace.RunPlan(plan), True)
+
+ def ForLoopTest(self, net_or_step):
+ step = control.For(net_or_step, self.N_)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+ def testForLoopWithNet(self):
+ self.ForLoopTest(self.cnt_net_)
+
+ def testForLoopWithStep(self):
+ step = control.Do(self.cnt_net_)
+ self.ForLoopTest(step)
+
+ def WhileLoopTest(self, net_or_step):
+ step = control.While(self.cond_net_, net_or_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+ def testWhileLoopWithNet(self):
+ self.WhileLoopTest(self.cnt_net_)
+
+ def testWhileLoopWithStep(self):
+ step = control.Do(self.cnt_net_)
+ self.WhileLoopTest(step)
+
+ def UntilLoopTest(self, net_or_step):
+ step = control.Until(self.not_cond_net_, net_or_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+ def testUntilLoopWithNet(self):
+ self.UntilLoopTest(self.cnt_net_)
+
+ def testUntilLoopWithStep(self):
+ step = control.Do(self.cnt_net_)
+ self.UntilLoopTest(step)
+
+ def DoWhileLoopTest(self, net_or_step):
+ step = control.DoWhile(self.cond_net_, net_or_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+ def testDoWhileLoopWithNet(self):
+ self.DoWhileLoopTest(self.cnt_net_)
+
+ def testDoWhileLoopWithStep(self):
+ step = control.Do(self.cnt_net_)
+ self.DoWhileLoopTest(step)
+
+ def DoUntilLoopTest(self, net_or_step):
+ step = control.DoUntil(self.not_cond_net_, net_or_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, self.N_)])
+
+ def testDoUntilLoopWithNet(self):
+ self.DoUntilLoopTest(self.cnt_net_)
+
+ def testDoUntilLoopWithStep(self):
+ step = control.Do(self.cnt_net_)
+ self.DoUntilLoopTest(step)
+
+ def IfCondTest(self, cond_net, expect, cond_on_blob):
+ if cond_on_blob:
+ step = control.Do(
+ control.Do(cond_net),
+ control.If(cond_net.Proto().external_output[-1],
+ self.cnt_net_))
+ else:
+ step = control.If(cond_net, self.cnt_net_)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, expect)])
+
+ def testIfCondTrueOnNet(self):
+ self.IfCondTest(self.true_cond_net_, 1, False)
+
+ def testIfCondTrueOnBlob(self):
+ self.IfCondTest(self.true_cond_net_, 1, True)
+
+ def testIfCondFalseOnNet(self):
+ self.IfCondTest(self.false_cond_net_, 0, False)
+
+ def testIfCondFalseOnBlob(self):
+ self.IfCondTest(self.false_cond_net_, 0, True)
+
+ def IfElseCondTest(self, cond_net, expect, cond_on_blob):
+ true_step = control.For(self.cnt_net_, self.N_)
+ false_step = control.For(self.cnt_net_, 2 * self.N_)
+ if cond_on_blob:
+ step = control.Do(
+ control.Do(cond_net),
+ control.If(cond_net.Proto().external_output[-1],
+ true_step, false_step))
+ else:
+ step = control.If(cond_net, true_step, false_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, expect)])
+
+ def testIfElseCondTrueOnNet(self):
+ self.IfElseCondTest(self.true_cond_net_, self.N_, False)
+
+ def testIfElseCondTrueOnBlob(self):
+ self.IfElseCondTest(self.true_cond_net_, self.N_, True)
+
+ def testIfElseCondFalseOnNet(self):
+ self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, False)
+
+ def testIfElseCondFalseOnBlob(self):
+ self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, True)
+
+ def IfNotCondTest(self, cond_net, expect, cond_on_blob):
+ if cond_on_blob:
+ step = control.Do(
+ control.Do(cond_net),
+ control.IfNot(cond_net.Proto().external_output[-1],
+ self.cnt_net_))
+ else:
+ step = control.IfNot(cond_net, self.cnt_net_)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, expect)])
+
+ def testIfNotCondTrueOnNet(self):
+ self.IfNotCondTest(self.true_cond_net_, 0, False)
+
+ def testIfNotCondTrueOnBlob(self):
+ self.IfNotCondTest(self.true_cond_net_, 0, True)
+
+ def testIfNotCondFalseOnNet(self):
+ self.IfNotCondTest(self.false_cond_net_, 1, False)
+
+ def testIfNotCondFalseOnBlob(self):
+ self.IfNotCondTest(self.false_cond_net_, 1, True)
+
+ def IfNotElseCondTest(self, cond_net, expect, cond_on_blob):
+ true_step = control.For(self.cnt_net_, self.N_)
+ false_step = control.For(self.cnt_net_, 2 * self.N_)
+ if cond_on_blob:
+ step = control.Do(
+ control.Do(cond_net),
+ control.IfNot(cond_net.Proto().external_output[-1],
+ true_step, false_step))
+ else:
+ step = control.IfNot(cond_net, true_step, false_step)
+ self.BuildAndRunPlan(step)
+ self.CheckNetOutput([(self.cnt_net_, expect)])
+
+ def testIfNotElseCondTrueOnNet(self):
+ self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, False)
+
+ def testIfNotElseCondTrueOnBlob(self):
+ self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, True)
+
+ def testIfNotElseCondFalseOnNet(self):
+ self.IfNotElseCondTest(self.false_cond_net_, self.N_, False)
+
+ def testIfNotElseCondFalseOnBlob(self):
+ self.IfNotElseCondTest(self.false_cond_net_, self.N_, True)
diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index b4f3d5b..810a2bf 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -490,6 +490,17 @@
return model, 224
+def AddParameterUpdate(model):
+ """ Simple plain SGD update -- not tuned to actually train the models """
+ ITER = model.Iter("iter")
+ LR = model.LearningRate(
+ ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999)
+ ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+ for param in model.params:
+ param_grad = model.param_to_grad[param]
+ model.WeightedSum([param, ONE, param_grad, LR], param)
+
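+# Note: the WeightedSum above computes, for each parameter,
+#     param <- 1.0 * param + LR * param_grad
+# and LR is negative (base_lr=-1e-8), so this is a plain (and deliberately
+# tiny) gradient-descent step -- enough to exercise the update path without
+# aiming for convergence.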
+
def Benchmark(model_gen, arg):
model, input_size = model_gen(arg.order)
model.Proto().type = arg.net_type
@@ -524,6 +535,7 @@
else:
print('{}: running forward-backward.'.format(arg.model))
model.AddGradientOperators(["loss"])
+ AddParameterUpdate(model)
if arg.order == 'NHWC':
print(
'==WARNING==\n'
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 9d42cad..81147c5 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -57,7 +57,7 @@
return (op_type + "_ENGINE_" + engine in _REGISTERED_OPERATORS)
-def DeviceOption(device_type, cuda_gpu_id, random_seed=None):
+def DeviceOption(device_type, cuda_gpu_id=0, random_seed=None):
option = caffe2_pb2.DeviceOption()
option.device_type = device_type
option.cuda_gpu_id = cuda_gpu_id
@@ -106,6 +106,9 @@
def __str__(self):
return self._name
+ def __repr__(self):
+ return 'BlobReference("{}")'.format(self._name)
+
def __add__(self, other):
if not isinstance(other, basestring):
raise RuntimeError('Cannot add BlobReference to a non-string.')
@@ -492,10 +495,17 @@
if (len(input_usage) <= 1 or fwd_op_idx != input_usage[0]):
# We do not need to do gradient accumulation yet.
continue
-
generator = self.gradient_generators[input_name][input_version]
- if not self._VerifyGradientGenerators(generator):
- continue
+ try:
+ if not self._VerifyGradientGenerators(generator):
+ continue
+ except RuntimeError as err:
+ raise RuntimeError(
+ "Gradients for param ''{}'' failed to verity: {}".format(
+ input_name,
+ err
+ )
+ )
# Finally, let's create the sum operator.
sum_op = self._MakeSumOp(input_name, input_version)
@@ -1125,6 +1135,24 @@
return netlike
+def output_to_list(op_output):
+ """
+ Ensures that the output of an operator is a list.
+ Use when an operator has a variable number of outputs, but a list of
+ outputs is desired even when number of outputs is 1.
+
+ Args:
+        op_output: Either a BlobReference or an iterable of BlobReferences.
+
+ Returns:
+ A list of BlobReferences.
+ """
+ assert type(op_output) in (list, tuple, BlobReference)
+ return (
+ [op_output]
+ if isinstance(op_output, BlobReference) else list(op_output))
+
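+# Example (illustrative; `some_op_output` is whatever a net operator call
+# returned -- a single BlobReference or a tuple of them):
+#
+#   blobs = output_to_list(some_op_output)
+#   # blobs is always a Python list of BlobReferences, even when the operator
+#   # produced exactly one output.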
+
def _add_net_to_dict(net_dict, net):
name = get_net_name(net)
if net in net_dict:
@@ -1152,6 +1180,9 @@
if num_iter is not None:
self._step.num_iter = num_iter
+ def Name(self):
+ return self._step.name
+
def __str__(self):
return self._step.name
@@ -1227,10 +1258,16 @@
class Plan(object):
- def __init__(self, name):
+ def __init__(self, name_or_step):
self._plan = caffe2_pb2.PlanDef()
- self._plan.name = name
self._net_dict = OrderedDict()
+ if isinstance(name_or_step, ExecutionStep):
+ self._plan.name = name_or_step.Name()
+ self.AddStep(name_or_step)
+ elif isinstance(name_or_step, basestring):
+ self._plan.name = name_or_step
+ else:
+ raise ValueError('name_or_step must be a string or ExecutionStep')
def __str__(self):
return self._plan.name
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
new file mode 100644
index 0000000..d094840
--- /dev/null
+++ b/caffe2/python/data_parallel_model.py
@@ -0,0 +1,407 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from types import FunctionType
+from functools import wraps
+import six
+
+from caffe2.python import cnn, dyndep, scope, workspace, core
+from caffe2.proto import caffe2_pb2
+
+dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
+
+
+DATAPARALLEL_OPS = [
+ "Conv",
+ "ConvTranspose",
+ "GroupConv",
+ "FC",
+ "FC_Decomp",
+ "FC_Prune",
+ "FC_Sparse",
+ "LRN",
+ "Dropout",
+ "MaxPool",
+ "AveragePool",
+ "Concat",
+ "DepthConcat",
+ "Relu",
+ "Transpose",
+ "SpatialBN",
+ "Accuracy",
+ "Adam",
+ "AveragedLoss",
+ "Cast",
+ "LabelCrossEntropy",
+ "LearningRate",
+ "Print",
+ "Scale",
+ "Snapshot",
+ "Softmax",
+ "StopGradient",
+ "Summarize",
+ "Sum",
+ "Tanh",
+ "WeightedSum",
+ "SquaredL2Distance",
+]
+
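+# Calling any of the ops listed above on a GPUDataParallelModel (outside an
+# explicit device scope) replicates the op once per device under a "gpu_{id}"
+# namescope with a CUDA device option, and returns a {gpu_id: output} dict --
+# see _data_parallel_wrapper below.  Illustrative sketch (dim_in / dim_out and
+# the blob names are hypothetical; compare data_parallel_model_test.py):
+#
+#   model = GPUDataParallelModel([0, 1], order="NHWC", name="example")
+#   fc = model.FC("data", "fc", dim_in, dim_out)
+#   # fc == {0: <gpu_0/fc>, 1: <gpu_1/fc>}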
+
+class _GPUDataParallelMetaClass(type):
+ """A meta class to patch method in order to distribute them over multiple
+ GPUs.
+ """
+ _devices = []
+
+ @staticmethod
+ def _data_parallel_wrapper(op):
+ @wraps(op)
+ def wrapped(cls, blob_in, blob_out, *args, **kwargs):
+ # Helpers to extract a device specific blob or a global blob
+ def self_or_item(d, key):
+ if isinstance(d, dict):
+ assert key in d
+ return d[key]
+ return d
+
+ def get_input(gpu_id):
+ if isinstance(blob_in, list):
+ return [self_or_item(blob, gpu_id) for blob in blob_in]
+ return self_or_item(blob_in, gpu_id)
+
+ def get_output(gpu_id):
+ return self_or_item(blob_out, gpu_id)
+
+ # If we have explicit device scope, we do not parallelize
+ if cls.explicit_scope():
+ return op(
+ cls,
+ blob_in,
+ blob_out,
+ *args,
+ **kwargs)
+
+ devices = _GPUDataParallelMetaClass._devices
+ results = {}
+ for gpu_id in devices:
+ with core.NameScope("gpu_{}".format(gpu_id)):
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ result = op(
+ cls,
+ get_input(gpu_id),
+ get_output(gpu_id),
+ *args,
+ **kwargs)
+ results[gpu_id] = result
+ return results
+
+ return wrapped
+
+ def __new__(meta, classname, bases, class_dict):
+ assert len(bases) == 1, "Expects only one base class"
+ base = bases[0]
+ assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
+ new_class_dict = {}
+ for name, attr in base.__dict__.items():
+ if name not in DATAPARALLEL_OPS:
+ continue
+ attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
+ new_class_dict[name] = attr
+ for name, attr in class_dict.items():
+ if name in new_class_dict:
+ continue
+ if isinstance(attr, FunctionType):
+ if name in DATAPARALLEL_OPS:
+ new_class_dict[name] = \
+ _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
+ else:
+ new_class_dict[name] = attr
+ return super(_GPUDataParallelMetaClass, meta).__new__(
+ meta, classname, bases, new_class_dict)
+
+
+@six.add_metaclass(_GPUDataParallelMetaClass)
+class GPUDataParallelModel(cnn.CNNModelHelper):
+ """A helper class that extends CNNModelHelper to support multi GPUs
+ data parallel training.
+ """
+ def __init__(self, devices, *args, **kwargs):
+        assert len(devices) >= 1, "Should have at least 1 GPU device"
+ assert len(devices) <= workspace.NumCudaDevices(), \
+ "Requested number of devices is greater than the number of GPUs"
+ _GPUDataParallelMetaClass._devices = devices
+ self._devices = devices
+ self._explicit_scope = False
+ self._gradient_reduce_all_added = False
+ super(GPUDataParallelModel, self).__init__(*args, **kwargs)
+
+ def explicit_scope(self):
+ return self._explicit_scope
+
+ def _call(self, name, *args, **kwargs):
+ return super(GPUDataParallelModel, self).__getattr__(
+ name)(*args, **kwargs)
+
+ # TODO(denisy): try out decorators to avoid this code below
+ def Accuracy(self, *args, **kwargs):
+ return self._call("Accuracy", *args, **kwargs)
+
+ def Adam(self, *args, **kwargs):
+ return self._call("Adam", *args, **kwargs)
+
+ def AveragedLoss(self, *args, **kwargs):
+ return self._call("AveragedLoss", *args, **kwargs)
+
+ def Cast(self, *args, **kwargs):
+ return self._call("Cast", *args, **kwargs)
+
+ def LabelCrossEntropy(self, *args, **kwargs):
+ return self._call("LabelCrossEntropy", *args, **kwargs)
+
+ def LearningRate(self, *args, **kwargs):
+ return self._call("LearningRate", *args, **kwargs)
+
+ def Print(self, *args, **kwargs):
+ return self._call("Print", *args, **kwargs)
+
+ def Scale(self, *args, **kwargs):
+ return self._call("Scale", *args, **kwargs)
+
+ def Snapshot(self, *args, **kwargs):
+ return self._call("Snapshot", *args, **kwargs)
+
+ def Softmax(self, *args, **kwargs):
+ return self._call("Softmax", *args, **kwargs)
+
+ def StopGradient(self, *args, **kwargs):
+ return self._call("StopGradient", *args, **kwargs)
+
+ def Sum(self, *args, **kwargs):
+ return self._call("Sum", *args, **kwargs)
+
+ def Summarize(self, *args, **kwargs):
+ return self._call("Summarize", *args, **kwargs)
+
+ def Tanh(self, *args, **kwargs):
+ return self._call("Tanh", *args, **kwargs)
+
+ def WeightedSum(self, *args, **kwargs):
+ return self._call("WeightedSum", *args, **kwargs)
+
+ def SquaredL2Distance(self, *args, **kwargs):
+ return self._call("SquaredL2Distance", *args, **kwargs)
+
+ def FinalizeSetup(self):
+ self.param_init_net.RunAllOnGPU()
+ self.RunAllOnGPU()
+
+ # Setup sync of initial params
+ self._SyncInitialParams()
+
+ def AddGradientOperators(self, params, *args, **kwargs):
+ def create_grad(param):
+ return self.ConstantFill(param, str(param) + "_grad", value=1.0)
+
+ param_grad = {}
+ # Explicitly need to create gradients on each GPU
+ for param in params:
+ if not isinstance(param, dict):
+ grad = create_grad(param)
+ param_grad[str(param)] = str(grad)
+ else:
+ for gpu_id in self._devices:
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ assert gpu_id in param
+ p = param[gpu_id]
+ g = create_grad(p)
+ param_grad[str(p)] = str(g)
+
+ return super(GPUDataParallelModel, self).AddGradientOperators(
+ param_grad, *args, **kwargs)
+
+ def AddWeightDecay(self, weight_decay):
+ if weight_decay == 0.0:
+ return
+
+ assert(weight_decay > 0.0)
+
+ self._explicit_scope = True
+ assert \
+ self._gradient_reduce_all_added, \
+ "Weight decay must be done after gradient sync between gpus"
+
+ for gpu_id in self._devices:
+ with core.NameScope("gpu_{}".format(gpu_id)):
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
+ value=weight_decay)
+ ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
+ value=1.0)
+ # Only update parameters that belong to the current GPU
+ params = self._CurrentScopeParams()
+
+ # Take only params that are weights
+ print("Adding weigth-decay for gpu {}.".format(gpu_id))
+
+ gpu_weights = [p for p in params if p in self.weights]
+ for w in gpu_weights:
+                        # Equivalent to grad += wd * w (weight decay term)
+ grad = self.param_to_grad[w]
+ self.net.WeightedSum([grad, ONE, w, wd], grad)
+
+ self._explicit_scope = False
+
+ def _SyncInitialParams(self):
+ # TODO(akyrola): replace with NCCLBroadcast when it's working
+ # This doesn't work right now:
+ # with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+ # workspace.RunOperatorOnce(
+ # core.CreateOperator(
+ # 'NCCLBroadcast', model.params, model.params, root=0))
+ unique_param_names = set(
+ stripParamName(p)
+ for p in self.params
+ )
+
+ self._explicit_scope = True
+        # Copy params from gpu_0 to the other GPUs
+ for param in unique_param_names:
+ for gpu_idx in self._devices[1:]:
+ device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
+ with core.DeviceScope(device_opt):
+ self.param_init_net.Copy(
+ "gpu_{}/{}".format(self._devices[0], param),
+ "gpu_{}/{}".format(gpu_idx, param)
+ )
+ self._explicit_scope = False
+
+ def _AllReduceGradients(self):
+ """Performs NCCL AllReduce to distribute gradients to all the GPUs."""
+
+ self._gradient_reduce_all_added = True
+
+ if len(self._devices) == 1:
+ return
+
+ # Take only params that have gradient associated with them.
+ unique_grads_names = set(
+ stripParamName(grad)
+ for grad in self.param_to_grad.values()
+ )
+ # Now we need to Allreduce gradients on all the GPUs.
+ # Pick GPU #0 as a master GPU.
+ self._explicit_scope = True
+ with core.DeviceScope(
+ core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
+ ):
+ # Group by grads for reduce.
+ for grad_name in unique_grads_names:
+ grads_group = [
+ grad
+ for grad in self.param_to_grad.values()
+ if stripParamName(grad) == grad_name
+ ]
+ assert len(grads_group) == len(self._devices), \
+ "Each GPU from {}, should have a copy of {}.".format(
+ self._devices, grad_name)
+ self.NCCLAllreduce(grads_group, grads_group)
+ self._explicit_scope = False
+
+ def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
+ """A helper to create learning rate."""
+ ITER = self.Iter("ITER")
+ # There is one interesting thing here: since we are minimizing, we are
+ # doing "descent" so the learning rate is set to be negative.
+ LR = self.net.LearningRate(
+ [ITER],
+ "LR",
+ base_lr=base_lr,
+ policy=policy,
+ **other_lr_params
+ )
+ return LR
+
+ def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
+ """A helper to construct gradient update for SGD."""
+ base_lr = base_lr / len(self._devices)
+ LR = self._BuildLR(base_lr, policy, **other_lr_params)
+ ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+ for param in params:
+ grad = self.param_to_grad[param]
+ if isinstance(grad, core.GradientSlice):
+ self.ScatterWeightedSum(
+ [param, ONE, grad.indices, grad.values, LR], param
+ )
+ else:
+ self.WeightedSum([param, ONE, grad, LR], param)
+
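+    # Note on _BuildSGD: the dense branch above performs, per parameter,
+    #     param <- 1.0 * param + LR * grad
+    # where LR is expected to be negative (see _BuildLR and the SGD(-0.1)
+    # call in data_parallel_model_test.py), i.e. plain gradient descent.
+    # base_lr is divided by len(self._devices), presumably because
+    # NCCLAllreduce sums (rather than averages) gradients across GPUs.
+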
+ def _CurrentScopeParams(self):
+ return [
+ param
+ for param in self.param_to_grad.keys()
+ if str(param).startswith(scope.NAMESCOPE)
+ ]
+
+ def SGD(self, base_lr, policy="fixed", **other_lr_params):
+ """Adds SGD optimizer to the model."""
+ self._AllReduceGradients()
+
+ # Create update params operators.
+ self._explicit_scope = True
+ for gpu_id in self._devices:
+ with core.NameScope("gpu_{}".format(gpu_id)):
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ # Only update parameters that belong to the current GPU
+ params = self._CurrentScopeParams()
+
+ # Add optimizer update operators
+ self._BuildSGD(params, base_lr, policy, **other_lr_params)
+ self._explicit_scope = False
+
+ def CustomSGD(
+ self,
+ paramup_build_fn,
+ base_lr,
+ lr_policy,
+ weight_decay,
+ **other_lr_pars
+ ):
+ """Custom parameter update function"""
+ self._AllReduceGradients()
+
+ self.AddWeightDecay(weight_decay)
+
+        # Run parameter update on each GPU
+ self._explicit_scope = True
+ for gpu_id in self._devices:
+ with core.NameScope("gpu_{}".format(gpu_id)):
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
+
+ params = self._CurrentScopeParams()
+ paramup_build_fn(self, params, LR)
+ self._explicit_scope = False
+
+ def ExecOnEachDevice(self, fn, *args, **kwargs):
+ self._explicit_scope = True
+ for gpu_id in self._devices:
+ with core.NameScope("gpu_{}".format(gpu_id)):
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ fn(self, *args, **kwargs)
+
+ self._explicit_scope = False
+
+
+# A helper function to extract a parameter's name
+def stripParamName(param):
+ # Format is "a/b/c/d" -> d
+ name = str(param)
+ sep = scope._NAMESCOPE_SEPARATOR
+ return name[name.rindex(sep) + 1:]
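+
+
+# Example (illustrative): stripParamName("gpu_0/fc_w") returns "fc_w",
+# assuming the namescope separator is "/" (per-GPU blob names above take the
+# form "gpu_{id}/<param>").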
diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py
new file mode 100644
index 0000000..653838d
--- /dev/null
+++ b/caffe2/python/data_parallel_model_test.py
@@ -0,0 +1,60 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import unittest
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, workspace, data_parallel_model
+from caffe2.python.test_util import TestCase
+
+
+@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
+class GPUDataParallelModelTest(TestCase):
+ def test(self):
+ gpu_devices = [0, 1] # gpu ids
+ perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32)
+ np.random.seed(123)
+ data = np.random.randint(
+ 2, size=(50, perfect_model.size)
+ ).astype(np.float32)
+ label = np.dot(data, perfect_model)[:, np.newaxis]
+
+ model = data_parallel_model.GPUDataParallelModel(
+ gpu_devices, order="NHWC", name="fake")
+
+ fc = model.FC("data", "fc", perfect_model.size, 1,
+ ("ConstantFill", {}), ("ConstantFill", {}), axis=0)
+ sq = model.SquaredL2Distance([fc, "label"], "sq")
+ loss = model.AveragedLoss(sq, "loss")
+ model.AddGradientOperators([loss])
+ model.SGD(-0.1)
+ model.RunAllOnGPU()
+
+ for gpu_id in gpu_devices:
+ with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
+ workspace.FeedBlob(
+ "gpu_{}/data".format(gpu_id), data[0])
+ workspace.FeedBlob(
+ "gpu_{}/label".format(gpu_id), label[0])
+
+ workspace.RunNetOnce(model.param_init_net)
+ workspace.CreateNet(model.net)
+
+ for i in range(2000):
+ idx = np.random.randint(data.shape[0])
+ for gpu_id in gpu_devices:
+ device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
+ with core.DeviceScope(device):
+ workspace.FeedBlob(
+ "gpu_{}/data".format(gpu_id), data[idx])
+ workspace.FeedBlob(
+ "gpu_{}/label".format(gpu_id), label[idx])
+ workspace.RunNet(model.net)
+
+ for gpu_id in gpu_devices:
+ np.testing.assert_allclose(
+ perfect_model[np.newaxis, :],
+ workspace.FetchBlob("gpu_{}/fc_w".format(gpu_id)),
+ atol=1e-2)
diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py
index 6db919c..6878afc 100644
--- a/caffe2/python/dataio.py
+++ b/caffe2/python/dataio.py
@@ -36,6 +36,24 @@
assert self._schema is not None, 'Schema not provided for this reader.'
return self._schema
+ def setup_ex(self, init_net, finish_net):
+ """Nets to be executed once at startup and finish.
+ Experimental extension. Don't use yet"""
+ pass
+
+ def read_ex(self, local_init_net, local_finish_net):
+ """Experimental extension to the interface. Don't use yet"""
+ read_net = core.Net('reader_body')
+ return ([read_net], ) + self.read(read_net)
+
+ def read_record_ex(self, local_init_net, local_finish_net):
+ """Experimental extension to the interface. Don't use yet"""
+ nets, should_stop, fields = self.read_ex(
+ local_init_net, local_finish_net)
+ if self._schema:
+ fields = from_blob_list(self._schema, fields)
+ return nets, should_stop, fields
+
"""
Reader is a abstract class to be implemented in order to provide
operations capable of iterating through a dataset or stream of data.
@@ -151,10 +169,31 @@
fields = fields.field_blobs()
self.write(writer_net, fields)
+ def setup_ex(self, init_net, finish_net):
+ """Experimental, don't use yet"""
+ self.commit(finish_net)
+
+ def write_ex(self, fields, local_init_net, local_finish_net, stop_blob):
+ """Experimental extension to the interface. Don't use yet"""
+ write_net = core.Net('write_net')
+ self.write(write_net, fields)
+ return [write_net]
+
+ def write_record_ex(
+ self, fields, local_init_net, local_finish_net, stop_blob=None):
+ """Experimental extension to the interface. Don't use yet."""
+ if isinstance(fields, Field):
+ fields = fields.field_blobs()
+ if stop_blob is None:
+ stop_blob = local_init_net.NextName("dequeue_status")
+ write_nets = self.write_ex(
+ fields, local_init_net, local_finish_net, stop_blob)
+ return (write_nets, stop_blob)
+
def commit(self, finish_net):
"""Add operations to `finish_net` that signal end of data.
This must be implemented by all Writers, but may be no-op for some
of them.
"""
- raise NotImplementedError('Writers must implement commit.')
+ pass
diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py
new file mode 100644
index 0000000..333ad7d
--- /dev/null
+++ b/caffe2/python/experiment_util.py
@@ -0,0 +1,52 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import datetime
+import time
+
+from collections import OrderedDict
+
+'''
+Utilities for logging experiment run stats, such as accuracy
+and loss over time for different runs. Runtime arguments are stored
+in the log.
+'''
+
+
+class ModelTrainerLog():
+
+ def __init__(self, expname, runtime_args):
+ now = datetime.datetime.fromtimestamp(time.time())
+ self.experiment_id = now.strftime('%Y%m%d_%H%M%S')
+ self.filename = "%s_%s.log" % (expname, self.experiment_id)
+ self.logstr("# %s" % str(runtime_args))
+ self.headers = None
+ self.start_time = time.time()
+
+    def logstr(self, line):
+        # Append a line to the log file and echo it to stdout.
+        with open(self.filename, "a") as f:
+            f.write(line + "\n")
+        print(line)
+
+ def log(self, input_count, batch_count, additional_values):
+ logdict = OrderedDict()
+ logdict['time'] = time.time() - self.start_time
+ logdict['input_counter'] = input_count
+ logdict['batch_count'] = batch_count
+ if logdict['time'] > 0:
+ logdict['inputs_per_sec'] = input_count / logdict['time']
+ else:
+ logdict['inputs_per_sec'] = 0.0
+
+ for k in sorted(additional_values.keys()):
+ logdict[k] = additional_values[k]
+
+ # Write the headers if they are not written yet
+ if self.headers is None:
+            self.headers = list(logdict.keys())
+ self.logstr(",".join(self.headers))
+
+ self.logstr(",".join([str(v) for v in logdict.values()]))
diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py
index 6b8573d..e5c0907 100644
--- a/caffe2/python/extension_loader.py
+++ b/caffe2/python/extension_loader.py
@@ -3,15 +3,11 @@
from __future__ import print_function
from __future__ import unicode_literals
import contextlib
-import sys
-import DLFCN
-
@contextlib.contextmanager
def DlopenGuard():
- # In python 2.7 required constants are not defined.
- # Thus they are listed explicitly
- flags = sys.getdlopenflags()
- sys.setdlopenflags(DLFCN.RTLD_GLOBAL | DLFCN.RTLD_NOW)
+ # This is a stub for setting up special tricks around python extensions
+ # loading. For example, it might do
+ # sys.setdlopenflags(DLFCN.RTLD_GLOBAL | DLFCN.RTLD_NOW)
+ # which might be required in some setups of python
yield
- sys.setdlopenflags(flags)
diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py
index ffd1f9e..edf9118 100644
--- a/caffe2/python/gradient_checker.py
+++ b/caffe2/python/gradient_checker.py
@@ -42,7 +42,17 @@
# Run gradient ops
workspace.RunOperatorsOnce(grad_ops)
# Get gradients
- grad = workspace.FetchBlob(grad_name)
+ if isinstance(grad_name, core.GradientSlice):
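+            # Densify the sparse gradient: scatter-add grad_name.values into
+            # a zero tensor shaped like x at grad_name.indices (both scale
+            # weights are the 'one' blob).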
+ workspace.FeedBlob('zeros', np.zeros_like(x, dtype=np.float32))
+ workspace.FeedBlob('one', np.ones(1, dtype=np.float32))
+ sparse_to_dense_op = core.CreateOperator(
+ 'ScatterWeightedSum',
+ ['zeros', 'one', grad_name.indices, grad_name.values, 'one'],
+ 'zeros')
+ workspace.RunOperatorOnce(sparse_to_dense_op)
+ grad = workspace.FetchBlob('zeros')
+ else:
+ grad = workspace.FetchBlob(grad_name)
return loss, grad
def CheckSimple(
@@ -86,10 +96,6 @@
grad_ops, g_input = core.GradientRegistry.GetGradientForOp(
op, [s + '_grad' for s in op.output])
- # sanity check: we only support dense gradient checking in this checker
- assert all(type(g) is not core.GradientSlice for g in g_input), \
- "This checker does not support sparse gradient yet."""
-
dims_to_check = inputs[input_to_check].size
# First, feed in the input.
for i, arr in enumerate(inputs):
diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py
index e0df47c..5cb2e7d 100644
--- a/caffe2/python/hypothesis_test.py
+++ b/caffe2/python/hypothesis_test.py
@@ -46,7 +46,7 @@
f_t = sigmoid(f_t)
o_t = sigmoid(o_t)
g_t = tanh(g_t)
- valid = (seq_lengths < t).astype(np.int32)
+ valid = (t < seq_lengths).astype(np.int32)
assert valid.shape == (N, D)
cell_t = ((f_t * cell_t_prev) + (i_t * g_t)) * (valid) + \
(1 - valid) * cell_t_prev
@@ -132,6 +132,7 @@
class TestOperators(hu.HypothesisTestCase):
+
def test_comparison_ops(self):
ops = {"LT": lambda x1, x2: [x1 < x2],
"LE": lambda x1, x2: [x1 <= x2],
@@ -584,8 +585,9 @@
in_place=st.booleans(),
lr=st.floats(min_value=0.1, max_value=0.9),
epsilon=st.floats(min_value=1e-5, max_value=1e-2),
+ engine=st.sampled_from([None, "SIMD"]),
**hu.gcs_cpu_only)
- def test_adagrad_sgd(self, inputs, in_place, lr, epsilon,
+ def test_adagrad_sgd(self, inputs, in_place, lr, epsilon, engine,
gc, dc):
w, grad, h = inputs
h = np.abs(h) + 0.01
@@ -595,7 +597,7 @@
["w", "h", "grad", "lr"],
["w" if in_place else "grad_o",
"h" if in_place else "h_o"],
- epsilon=epsilon, device_option=gc)
+ epsilon=epsilon, engine=engine, device_option=gc)
self.assertDeviceChecks(dc, op, [w, h, grad, lr], [0])
self.assertReferenceChecks(gc, op, [w, h, grad, lr],
@@ -604,9 +606,10 @@
@given(inputs=hu.tensors(n=3),
lr=st.floats(min_value=0.1, max_value=0.9),
epsilon=st.floats(min_value=1e-5, max_value=1e-2),
+ engine=st.sampled_from([None, "SIMD"]),
**hu.gcs_cpu_only)
def test_sparse_adagrad_sgd(self, inputs, lr, epsilon,
- gc, dc):
+ engine, gc, dc):
w, grad, h = inputs
indices = np.arange(h.shape[0])
indices = indices[indices % 2 == 0]
@@ -618,6 +621,7 @@
["param", "h", "indices", "grad", "lr"],
["param", "h"],
epsilon=epsilon,
+ engine=engine,
device_option=gc)
self.assertDeviceChecks(
dc, op, [w, h, indices, grad, lr], [0])
@@ -876,12 +880,12 @@
sids = []
for i, l in enumerate(lengths):
sids.extend(l * [i])
- return (np.array(sids, dtype=int), )
+ return (np.array(sids, dtype=np.int32), )
self.assertReferenceChecks(
device_option=gc,
op=op,
- inputs=[np.array(lengths, dtype=int)],
+ inputs=[np.array(lengths, dtype=np.int32)],
reference=op_ref)
@given(lengths=st.lists(st.integers(min_value=0, max_value=10),
@@ -903,7 +907,7 @@
self.assertReferenceChecks(
device_option=gc,
op=op,
- inputs=[np.array(lengths, dtype=int)],
+ inputs=[np.array(lengths, dtype=np.int32)],
reference=op_ref)
@given(prediction=hu.arrays(dims=[10, 3],
@@ -970,7 +974,7 @@
def ids_to_lengths(ids):
ids_length = len(ids)
if ids_length == 0:
- return (np.array([], dtype=int),)
+ return (np.array([], dtype=np.int32),)
lengths = []
# segment id starts with 0
@@ -988,14 +992,68 @@
tmp_length = 0
tmp_length += 1
lengths.append(tmp_length)
- return (np.array(lengths, dtype=int),)
+ return (np.array(lengths, dtype=np.int32),)
self.assertReferenceChecks(
device_option=gc,
op=op,
- inputs=[np.array(segment_ids, dtype=int)],
+ inputs=[np.array(segment_ids, dtype=np.int32)],
reference=ids_to_lengths)
+ @given(lengths=st.lists(st.integers(min_value=1, max_value=10),
+ min_size=0,
+ max_size=10),
+ power=st.sampled_from([0.5, 1.0, 1.5, 2.0]),
+ **hu.gcs_cpu_only)
+ def test_segment_ids_to_lengths_weight(self, lengths, power, gc, dc):
+ op = core.CreateOperator(
+ "SegmentIdsToLengthWeights",
+ ["segment_ids"],
+ ["lengths"],
+ power=power)
+
+ def lengths_to_ids(lengths):
+ sids = []
+ for i, l in enumerate(lengths):
+ sids.extend(l * [i])
+ return sids
+
+ segment_ids = lengths_to_ids(lengths)
+
+ def ids_to_length_weights(ids):
+ ids_length = len(ids)
+ if ids_length == 0:
+ return (np.array([], dtype=float),)
+
+ lengths = []
+ # segment id starts with 0
+ prev_id = -1
+ tmp_length = 0
+ for idx in range(ids_length):
+ cur_id = ids[idx]
+ if cur_id != prev_id:
+ if idx != 0:
+ lengths.append(tmp_length)
+ while prev_id + 1 != cur_id:
+ lengths.append(0)
+ prev_id += 1
+ prev_id = cur_id
+ tmp_length = 0
+ tmp_length += 1
+ lengths.append(tmp_length)
+
+ weighted_length = []
+ for l in lengths:
+ weighted_length.extend(l * [1 / pow(l, power)])
+
+ return (np.array(weighted_length, dtype=float),)
+
+ self.assertReferenceChecks(
+ device_option=gc,
+ op=op,
+ inputs=[np.array(segment_ids, dtype=np.int32)],
+ reference=ids_to_length_weights)
+
@given(input_tensor=hu.arrays(
dims=[10], elements=st.floats(allow_nan=False,
allow_infinity=False)),
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index aeb2620..2de9fc0 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -109,6 +109,8 @@
def segment_ids(size, is_sorted):
+ if size == 0:
+ return st.just(np.empty(shape=[0], dtype=np.int32))
if is_sorted:
return arrays(
[size],
@@ -122,30 +124,73 @@
elements=st.integers(min_value=0, max_value=2 * size))
-def segmented_tensor(min_dim=1, max_dim=4, dtype=np.float32, is_sorted=True,
- elements=None, **kwargs):
+def lengths(size, **kwargs):
+    # First generate the number of borders between segments.
+    # Then create border values and append 0 and size.
+    # Sorting and taking np.diff converts them into segment lengths,
+    # which may include zero-length segments.
+ if size == 0:
+ return st.just(np.empty(shape=[0], dtype=np.int32))
+ return st.integers(
+ min_value=0, max_value=size - 1
+ ).flatmap(lambda num_boarders:
+ hypothesis.extra.numpy.arrays(
+ np.int32, num_boarders, elements=st.integers(
+ min_value=0, max_value=size
+ )
+ )
+ ).map(lambda x: np.append(x, np.array([0, size], dtype=np.int32))
+ ).map(sorted).map(np.diff)
+
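+# Example (illustrative): for size=5, a draw of borders [2, 2, 4] becomes
+# sorted([2, 2, 4, 0, 5]) = [0, 2, 2, 4, 5], and np.diff yields the segment
+# lengths [2, 0, 2, 1], which sum to 5 and may contain zeros.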
+
+def segmented_tensor(
+ min_dim=1,
+ max_dim=4,
+ dtype=np.float32,
+ is_sorted=True,
+ elements=None,
+ segment_generator=segment_ids,
+ allow_empty=False,
+ **kwargs
+):
+ gen_empty = st.booleans() if allow_empty else st.just(False)
data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
+ data_dims_ = st.tuples(
+ gen_empty, data_dims_
+ ).map(lambda pair: ([0] if pair[0] else []) + pair[1])
return data_dims_.flatmap(lambda data_dims: st.tuples(
arrays(data_dims, dtype, elements),
- segment_ids(data_dims[0], is_sorted=is_sorted),
+ segment_generator(data_dims[0], is_sorted=is_sorted),
))
+def lengths_tensor(*args, **kwargs):
+ return segmented_tensor(*args, segment_generator=lengths, **kwargs)
+
+
def sparse_segmented_tensor(min_dim=1, max_dim=4, dtype=np.float32,
- is_sorted=True, elements=None, **kwargs):
+ is_sorted=True, elements=None, allow_empty=False,
+ segment_generator=segment_ids, **kwargs):
+ gen_empty = st.booleans() if allow_empty else st.just(False)
data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
- all_dims_ = data_dims_.flatmap(lambda data_dims: st.tuples(
- st.just(data_dims),
- st.integers(min_value=1, max_value=data_dims[0]),
- ))
+ all_dims_ = st.tuples(gen_empty, data_dims_).flatmap(
+ lambda pair: st.tuples(
+ st.just(pair[1]),
+ (st.integers(min_value=1, max_value=pair[1][0]) if not pair[0]
+ else st.just(0)),
+ ))
return all_dims_.flatmap(lambda dims: st.tuples(
arrays(dims[0], dtype, elements),
arrays(dims[1], dtype=np.int64, elements=st.integers(
min_value=0, max_value=dims[0][0] - 1)),
- segment_ids(dims[1], is_sorted=is_sorted),
+ segment_generator(dims[1], is_sorted=is_sorted),
))
+def sparse_lengths_tensor(**kwargs):
+ return sparse_segmented_tensor(segment_generator=lengths, **kwargs)
+
+
def tensors(n, min_dim=1, max_dim=4, dtype=np.float32, elements=None, **kwargs):
dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
return dims_.flatmap(
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
new file mode 100644
index 0000000..26b8b97
--- /dev/null
+++ b/caffe2/python/model_helper.py
@@ -0,0 +1,130 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from caffe2.python import core
+
+import logging
+
+
+class ModelHelperBase(object):
+ """A helper model so we can write models more easily, without having to
+ manually define parameter initializations and operators separately.
+ In order to add support for specific operators, inherit from this class
+    and add corresponding methods. Operator-representing methods should
+    take care of adding their parameters to params.
+ """
+
+ def __init__(self, name=None, init_params=True, allow_not_known_ops=True):
+ if name is None:
+ name = "model"
+ self.net = core.Net(name)
+ self.param_init_net = core.Net(name + '_init')
+
+ self.param_to_grad = {}
+ self.params = []
+ self.gradient_ops_added = False
+ self.init_params = init_params
+ self.allow_not_known_ops = allow_not_known_ops
+
+ def Proto(self):
+ return self.net.Proto()
+
+ def InitProto(self):
+ return self.param_init_net.Proto()
+
+ def RunAllOnGPU(self, *args, **kwargs):
+ self.param_init_net.RunAllOnGPU(*args, **kwargs)
+ self.net.RunAllOnGPU(*args, **kwargs)
+
+ def CreateDB(self, blob_out, db, db_type, **kwargs):
+ dbreader = self.param_init_net.CreateDB(
+ [], blob_out, db=db, db_type=db_type, **kwargs)
+ return dbreader
+
+ def AddGradientOperators(self, *args, **kwargs):
+ if self.gradient_ops_added:
+ raise RuntimeError("You cannot run AddGradientOperators twice.")
+ self.gradient_ops_added = True
+ grad_map = self.net.AddGradientOperators(*args, **kwargs)
+ for p in self.params:
+ if str(p) in grad_map:
+ self.param_to_grad[p] = grad_map[str(p)]
+ return grad_map
+
+ def TensorProtosDBInput(
+ self, unused_blob_in, blob_out, batch_size, db, db_type, **kwargs
+ ):
+ """TensorProtosDBInput."""
+ dbreader_name = "dbreader_" + db
+ dbreader = self.param_init_net.CreateDB(
+ [], dbreader_name,
+ db=db, db_type=db_type)
+ return self.net.TensorProtosDBInput(
+ dbreader, blob_out, batch_size=batch_size)
+
+ def AddOperator(self, op_type, inputs, parameters, *args, **kwargs):
+ """
+ Adds an operator to a model. Use parameters list
+ to specify which operator inputs are model parameters to be
+ optimized.
+
+ Example of usage:
+
+ model.SparseLengthsSum(
+ [embedding, indices, lengths],
+ parameters=[embedding],
+ )
+
+ Here embedding is a parameter to be optimized while indices
+ and lengths are not.
+ """
+
+        extra_parameters = [x for x in parameters if x not in inputs]
+        if len(extra_parameters) > 0:
+            raise Exception("Some parameters are not inputs: {}".format(
+                list(map(str, extra_parameters))
+            ))
+
+ self.params.extend(parameters)
+ return self.net.__getattr__(op_type)(inputs, *args, **kwargs)
+
+ def __getattr__(self, op_type):
+ """Catch-all for all other operators, mostly those without params."""
+ if not core.IsOperator(op_type):
+ raise RuntimeError(
+ 'Method ' + op_type + ' is not a registered operator.'
+ )
+ # known_working_ops are operators that do not need special care.
+ known_working_ops = [
+ "Accuracy",
+ "Adam",
+ "AveragedLoss",
+ "Cast",
+ "EnsureCPUOutput",
+ "LabelCrossEntropy",
+ "LearningRate",
+ "Print",
+ "Sigmoid",
+ "Scale",
+ "Snapshot",
+ "Softmax",
+ "StopGradient",
+ "Summarize",
+ "Tanh",
+ "WeightedSum",
+ "SquaredL2Distance",
+ "FlattenToVec",
+ "NHWC2NCHW",
+ "ScatterWeightedSum",
+ "Squeeze",
+ "NCCLAllreduce",
+ "ConstantFill",
+ "Add",
+ "DequeueBlobs",
+ ]
+ if op_type not in known_working_ops:
+ assert self.allow_not_known_ops
+ logging.warning("You are creating an op that the ModelHelperBase "
+ "does not recognize: {}.".format(op_type))
+ return self.net.__getattr__(op_type)
diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py
new file mode 100644
index 0000000..30caa77
--- /dev/null
+++ b/caffe2/python/models/resnet.py
@@ -0,0 +1,255 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+'''
+Utility for creating ResNets
+See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015
+'''
+
+
+class ResNetBuilder():
+ '''
+ Helper class for constructing residual blocks.
+ '''
+ def __init__(self, model, prev_blob):
+ self.model = model
+ self.comp_count = 0
+ self.comp_idx = 0
+ self.prev_blob = prev_blob
+
+ def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0):
+ self.comp_idx += 1
+ self.prev_blob = self.model.Conv(
+ self.prev_blob,
+ 'comp_%d_conv_%d' % (self.comp_count, self.comp_idx),
+ in_filters,
+ out_filters,
+ weight_init=("MSRAFill", {}),
+ kernel=kernel,
+ stride=stride,
+ pad=pad
+ )
+ return self.prev_blob
+
+ def add_relu(self):
+ self.prev_blob = self.model.Relu(
+ self.prev_blob,
+ 'comp_%d_relu_%d' % (self.comp_count, self.comp_idx)
+ )
+ return self.prev_blob
+
+ def add_spatial_bn(self, num_filters):
+ self.prev_blob = self.model.SpatialBN(
+ self.prev_blob,
+ 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx),
+ num_filters,
+ epsilon=1e-3
+ )
+ return self.prev_blob
+
+ '''
+ Add a "bottleneck" component as decribed in He et. al. Figure 3 (right)
+ '''
+ def add_bottleneck(
+ self,
+ input_filters, # num of feature maps from preceding layer
+ base_filters, # num of filters internally in the component
+ output_filters, # num of feature maps to output
+ down_sampling=False,
+ spatial_batch_norm=True,
+ ):
+ self.comp_idx = 0
+ shortcut_blob = self.prev_blob
+
+ # 1x1
+ self.add_conv(
+ input_filters,
+ base_filters,
+ kernel=1,
+ stride=1
+ )
+
+ if spatial_batch_norm:
+ self.add_spatial_bn(base_filters)
+
+ self.add_relu()
+
+ # 3x3 (note the pad, required for keeping dimensions)
+ self.add_conv(
+ base_filters,
+ base_filters,
+ kernel=3,
+ stride=(1 if down_sampling is False else 2),
+ pad=1
+ )
+
+ if spatial_batch_norm:
+ self.add_spatial_bn(base_filters)
+ self.add_relu()
+
+ # 1x1
+ last_conv = self.add_conv(base_filters, output_filters, kernel=1)
+ if spatial_batch_norm:
+ last_conv = self.add_spatial_bn(output_filters)
+
+ # Summation with input signal (shortcut)
+        # If we need to increase the number of feature maps, we need to
+        # do a projection for the shortcut
+ if (output_filters > input_filters):
+ shortcut_blob = self.model.Conv(
+ shortcut_blob,
+ 'shortcut_projection_%d' % self.comp_count,
+ input_filters,
+ output_filters,
+ weight_init=("MSRAFill", {}),
+ kernel=1,
+ stride=(1 if down_sampling is False else 2)
+ )
+ if spatial_batch_norm:
+ shortcut_blob = self.model.SpatialBN(
+ shortcut_blob,
+ 'shortcut_projection_%d_spatbn' % self.comp_count,
+ output_filters,
+ epsilon=1e-3,
+ )
+
+ self.prev_blob = self.model.Sum(
+ [shortcut_blob, last_conv],
+ 'comp_%d_sum_%d' % (self.comp_count, self.comp_idx)
+ )
+ self.comp_idx += 1
+ self.add_relu()
+
+        # Keep track of the number of high-level components in this ResNetBuilder
+ self.comp_count += 1
+
+ def add_simple_block(
+ self,
+ input_filters,
+ num_filters,
+ down_sampling=False,
+ spatial_batch_norm=True
+ ):
+ self.comp_idx = 0
+ shortcut_blob = self.prev_blob
+
+ # 3x3
+ self.add_conv(
+ input_filters,
+ num_filters,
+ kernel=3,
+ stride=(1 if down_sampling is False else 2),
+ pad=1
+ )
+
+ if spatial_batch_norm:
+ self.add_spatial_bn(num_filters)
+ self.add_relu()
+
+ last_conv = self.add_conv(num_filters, num_filters, kernel=3, pad=1)
+ if spatial_batch_norm:
+ last_conv = self.add_spatial_bn(num_filters)
+
+ # Increase of dimensions, need a projection for the shortcut
+ if (num_filters != input_filters):
+ shortcut_blob = self.model.Conv(
+ shortcut_blob,
+ 'shortcut_projection_%d' % self.comp_count,
+ input_filters,
+ num_filters,
+ weight_init=("MSRAFill", {}),
+ kernel=1,
+ stride=(1 if down_sampling is False else 2),
+ )
+ if spatial_batch_norm:
+ shortcut_blob = self.model.SpatialBN(
+ shortcut_blob,
+ 'shortcut_projection_%d_spatbn' % self.comp_count,
+ num_filters,
+ epsilon=1e-3
+ )
+
+ self.prev_blob = self.model.Sum(
+ [shortcut_blob, last_conv],
+ 'comp_%d_sum_%d' % (self.comp_count, self.comp_idx)
+ )
+ self.comp_idx += 1
+ self.add_relu()
+
+        # Keep track of the number of high-level components in this ResNetBuilder
+ self.comp_count += 1
+
+
+def create_resnet50(model, data, num_input_channels, num_labels):
+ # conv1 + maxpool
+    model.Conv(
+        data,
+        'conv1',
+        num_input_channels,
+        64,
+        weight_init=("MSRAFill", {}),
+        kernel=7,
+        stride=2,
+        pad=3
+    )
+ model.SpatialBN('conv1', 'conv1_spatbn', 64, epsilon=1e-3)
+ model.Relu('conv1_spatbn', 'relu1')
+ model.MaxPool('relu1', 'pool1', kernel=3, stride=2)
+
+ # Residual blocks...
+ builder = ResNetBuilder(model, 'pool1')
+
+ # conv2_x (ref Table 1 in He et al. (2015))
+ builder.add_bottleneck(64, 64, 256)
+ builder.add_bottleneck(256, 64, 256)
+ builder.add_bottleneck(256, 64, 256)
+
+ # conv3_x
+ builder.add_bottleneck(256, 128, 512, down_sampling=True)
+ for i in range(1, 4):
+ builder.add_bottleneck(512, 128, 512)
+
+ # conv4_x
+ builder.add_bottleneck(512, 256, 1024, down_sampling=True)
+ for i in range(1, 6):
+ builder.add_bottleneck(1024, 256, 1024)
+
+ # conv5_x
+ builder.add_bottleneck(1024, 512, 2048, down_sampling=True)
+ builder.add_bottleneck(2048, 512, 2048)
+ builder.add_bottleneck(2048, 512, 2048)
+
+ # Final layers
+ model.AveragePool(builder.prev_blob, 'final_avg', kernel=7, stride=1)
+
+ # Final dimension of the "image" is reduced to 7x7
+ model.FC('final_avg', 'pred', 2048, num_labels)
+
+ softmax = model.Softmax('pred', 'softmax')
+ return softmax
+
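+# Example (a minimal sketch; assumes a CNNModelHelper-style `model` -- as in
+# caffe2.python.cnn -- and a 3-channel 224x224 NCHW "data" blob, which is what
+# reduces the final feature map to 7x7 for the kernel-7 average pool above):
+#
+#   from caffe2.python import cnn
+#   model = cnn.CNNModelHelper(order="NCHW", name="resnet50")
+#   softmax = create_resnet50(model, "data", num_input_channels=3,
+#                             num_labels=1000)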
+
+def create_resnet_32x32(
+ model, data, num_input_channels, num_groups, num_labels
+):
+ '''
+    Create a residual net for smaller images (sec 4.2 of He et al. (2015))
+ num_groups = 'n' in the paper
+ '''
+    # conv1
+ model.Conv(data, 'conv1', num_input_channels, 16, kernel=3, stride=1)
+ model.SpatialBN('conv1', 'conv1_spatbn', 16, epsilon=1e-3)
+ model.Relu('conv1_spatbn', 'relu1')
+
+ # Number of blocks as described in sec 4.2
+ filters = [16, 32, 64]
+
+ builder = ResNetBuilder(model, 'relu1')
+ prev_filters = 16
+ for groupidx in range(0, 3):
+ for blockidx in range(0, 2 * num_groups):
+ builder.add_simple_block(
+ prev_filters if blockidx == 0 else filters[groupidx],
+ filters[groupidx],
+ down_sampling=(True if blockidx == 0 and
+ groupidx > 0 else False))
+ prev_filters = filters[groupidx]
+
+ # Final layers
+ model.AveragePool(builder.prev_blob, 'final_avg', kernel=8, stride=1)
+ model.FC('final_avg', 'pred', 64, num_labels)
+ softmax = model.Softmax('pred', 'softmax')
+ return softmax
diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py
index 837fe36..ec7b383 100644
--- a/caffe2/python/muji.py
+++ b/caffe2/python/muji.py
@@ -22,6 +22,12 @@
return device_option
+def OnCPU():
+ device_option = caffe2_pb2.DeviceOption()
+ device_option.device_type = caffe2_pb2.CPU
+ return device_option
+
+
def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
"""The general Allreduce interface that reroutes the function calls.
"""
diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py
index 81f58ce..9645d85 100644
--- a/caffe2/python/net_drawer.py
+++ b/caffe2/python/net_drawer.py
@@ -59,18 +59,33 @@
return json.dumps(name)
-def GetPydotGraph(operators_or_net, name=None, rankdir='LR'):
+def GetOpNodeProducer(append_output, **kwargs):
+ def ReallyGetOpNode(op, op_id):
+ if op.name:
+ node_name = '%s/%s (op#%d)' % (op.name, op.type, op_id)
+ else:
+ node_name = '%s (op#%d)' % (op.type, op_id)
+ if append_output:
+ for output_name in op.output:
+ node_name += '\n' + output_name
+ return pydot.Node(node_name, **kwargs)
+ return ReallyGetOpNode
+
+
+def GetPydotGraph(
+ operators_or_net,
+ name=None,
+ rankdir='LR',
+ node_producer=None
+):
+ if node_producer is None:
+ node_producer = GetOpNodeProducer(False, **OP_STYLE)
operators, name = _rectify_operator_and_name(operators_or_net, name)
graph = pydot.Dot(name, rankdir=rankdir)
pydot_nodes = {}
pydot_node_counts = defaultdict(int)
for op_id, op in enumerate(operators):
- if op.name:
- op_node = pydot.Node(
- '%s/%s (op#%d)' % (op.name, op.type, op_id), **OP_STYLE
- )
- else:
- op_node = pydot.Node('%s (op#%d)' % (op.type, op_id), **OP_STYLE)
+ op_node = node_producer(op, op_id)
graph.add_node(op_node)
# print 'Op: %s' % op.name
# print 'inputs: %s' % str(op.input)
@@ -104,9 +119,10 @@
def GetPydotGraphMinimal(
operators_or_net,
- name,
+ name=None,
rankdir='LR',
- minimal_dependency=False
+ minimal_dependency=False,
+ node_producer=None,
):
"""Different from GetPydotGraph, hide all blob nodes and only show op nodes.
@@ -115,6 +131,8 @@
op a and b, and op b depends on a, then only the edge b->c will be drawn
because a->c will be implied.
"""
+ if node_producer is None:
+ node_producer = GetOpNodeProducer(False, **OP_STYLE)
operators, name = _rectify_operator_and_name(operators_or_net, name)
graph = pydot.Dot(name, rankdir=rankdir)
# blob_parents maps each blob name to its generating op.
@@ -122,12 +140,7 @@
# op_ancestry records the ancestors of each op.
op_ancestry = defaultdict(set)
for op_id, op in enumerate(operators):
- if op.name:
- op_node = pydot.Node(
- '%s/%s (op#%d)' % (op.name, op.type, op_id), **OP_STYLE
- )
- else:
- op_node = pydot.Node('%s (op#%d)' % (op.type, op_id), **OP_STYLE)
+ op_node = node_producer(op, op_id)
graph.add_node(op_node)
# Get parents, and set up op ancestry.
parents = [
@@ -175,7 +188,7 @@
return nodes
-def _draw_steps(steps, g, skip_step_edges=False):
+def _draw_steps(steps, g, skip_step_edges=False): # noqa
kMaxParallelSteps = 3
def get_label():
@@ -253,6 +266,9 @@
help="If set, only draw minimal dependency."
)
parser.add_argument(
+ "--append_output", action="store_true",
+ help="If set, append the output blobs to the operator names.")
+ parser.add_argument(
"--rankdir", type=str, default="LR",
help="The rank direction of the pydot graph."
)
@@ -268,13 +284,17 @@
for key, operators in graphs.items():
if args.minimal:
graph = GetPydotGraphMinimal(
- operators, key,
+ operators,
+ name=key,
rankdir=args.rankdir,
+ node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE),
minimal_dependency=args.minimal_dependency)
else:
graph = GetPydotGraph(
- operators, key,
- rankdir=args.rankdir)
+ operators,
+ name=key,
+ rankdir=args.rankdir,
+ node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE))
filename = args.output_prefix + graph.get_name() + '.dot'
graph.write(filename, format='raw')
pdf_filename = filename[:-3] + 'pdf'
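
A short sketch of how the new node_producer hook can be used programmatically (outside the CLI path above); the net contents are placeholders and pydot is assumed to be installed.

from caffe2.python import core, net_drawer

net = core.Net('drawer_example')
net.Relu(['x'], ['y'])
graph = net_drawer.GetPydotGraph(
    net,
    rankdir='TB',
    node_producer=net_drawer.GetOpNodeProducer(True, **net_drawer.OP_STYLE),  # append output blob names
)
graph.write_svg('drawer_example.svg')
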
diff --git a/caffe2/python/op/python_op.cpp b/caffe2/python/op/python_op.cpp
index 0d76cb9..0396ee7 100644
--- a/caffe2/python/op/python_op.cpp
+++ b/caffe2/python/op/python_op.cpp
@@ -64,7 +64,29 @@
try {
pyFunc(inputs, outputs);
} catch (const py::error_already_set& e) {
- LOG(ERROR) << "Python exception: " << e.what();
+ LOG(ERROR) << "Exception encountered running PythonOp function: "
+ << e.what() << "\nTraceback: ";
+ PyObject *type = nullptr, *value = nullptr, *trace = nullptr;
+ PyErr_Fetch(&type, &value, &trace);
+ PyTracebackObject* traceback =
+ reinterpret_cast<PyTracebackObject*>(trace);
+ vector<PyTracebackObject*> trace_vec;
+ while (traceback) {
+ trace_vec.push_back(traceback);
+ traceback = traceback->tb_next;
+ }
+ for (int i = trace_vec.size() - 1; i >= 0; --i) {
+ int line = trace_vec[i]->tb_lineno;
+ const char* filename =
+ PyString_AsString(trace_vec[i]->tb_frame->f_code->co_filename);
+ const char* funcname =
+ PyString_AsString(trace_vec[i]->tb_frame->f_code->co_name);
+ LOG(ERROR) << " # " << trace_vec.size() - i - 1 << " " << filename
+ << " (" << line << "): " << funcname;
+ }
+ Py_XDECREF(type);
+ Py_XDECREF(value);
+ Py_XDECREF(trace);
return false;
}
}
diff --git a/caffe2/python/op/python_test.py b/caffe2/python/op/python_test.py
index 9eed723..b3fa04d 100644
--- a/caffe2/python/op/python_test.py
+++ b/caffe2/python/op/python_test.py
@@ -10,6 +10,14 @@
import numpy as np
+def SubFunctionThatThrowsRuntimeError():
+ raise RuntimeError("This is an intentional exception.")
+
+
+def MainOpFunctionThatThrowsRuntimeError(inputs, _):
+ return SubFunctionThatThrowsRuntimeError()
+
+
class PythonOpTest(hu.HypothesisTestCase):
@given(x=hu.tensor())
def test_feed(self, x):
@@ -22,6 +30,11 @@
workspace.FeedBlob("x", x)
workspace.RunOperatorOnce(op)
+ def test_exception(self):
+ op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], [])
+ with self.assertRaises(RuntimeError):
+ workspace.RunOperatorOnce(op)
+
@given(x=hu.tensor())
def test_feed_with_helper_function(self, x):
def f(inputs, _):
@@ -65,7 +78,7 @@
def f(inputs, outputs):
try:
raise Exception("Exception in handler")
- except:
+ except Exception:
pass
op = CreatePythonOperator(f, ["x"], ["y"])
diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py
index 5f7ed43..77a3981 100644
--- a/caffe2/python/operator_test/conv_test.py
+++ b/caffe2/python/operator_test/conv_test.py
@@ -26,6 +26,7 @@
batch_size=st.integers(1, 3),
order=st.sampled_from(["NCHW", "NHWC"]),
engine=st.sampled_from(["", "EIGEN"]),
+ shared_buffer=st.booleans(),
**hu.gcs)
@settings(max_examples=2, timeout=100)
def test_convolution_separate_stride_pad_gradients(self, stride_h, stride_w,
@@ -34,7 +35,8 @@
input_channels,
output_channels,
batch_size, order,
- engine, gc, dc):
+ engine, shared_buffer,
+ gc, dc):
op = core.CreateOperator(
"Conv",
["X", "w", "b"],
@@ -48,6 +50,7 @@
kernel=kernel,
order=order,
engine=engine,
+ shared_buffer=int(shared_buffer),
)
X = np.random.rand(
batch_size, size, size, input_channels).astype(np.float32) - 0.5
diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py
index 1669947..b7b008f 100644
--- a/caffe2/python/operator_test/conv_transpose_test.py
+++ b/caffe2/python/operator_test/conv_transpose_test.py
@@ -19,11 +19,13 @@
input_channels=st.integers(1, 8),
output_channels=st.integers(1, 8),
batch_size=st.integers(1, 3),
- engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
+ engine=st.sampled_from(["", "CUDNN"]),
+ shared_buffer=st.booleans(),
+ **hu.gcs)
def test_convolution_transpose_layout(self, stride, pad, kernel, adj,
size, input_channels,
output_channels, batch_size,
- engine, gc, dc):
+ engine, shared_buffer, gc, dc):
assume(adj < stride)
X = np.random.rand(
batch_size, size, size, input_channels).astype(np.float32) - 0.5
@@ -43,6 +45,7 @@
adj=adj,
order=order,
engine=engine,
+ shared_buffer=int(shared_buffer),
device_option=gc,
)
if order == "NCHW":
diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py
index 69d7420..83d6128 100644
--- a/caffe2/python/operator_test/dataset_ops_test.py
+++ b/caffe2/python/operator_test/dataset_ops_test.py
@@ -314,3 +314,63 @@
workspace.RunNet(str(read_next_net))
actual = FetchRecord(batch)
_assert_records_equal(actual, entry)
+
+ def test_collect_tensor_ops(self):
+ init_net = core.Net('init_net')
+ blobs = ['blob_1', 'blob_2', 'blob_3']
+ bvec_map = {}
+ ONE = init_net.ConstantFill([], 'ONE', shape=[1, 2], value=1)
+ for b in blobs:
+ init_net.ConstantFill([], [b], shape=[1, 2], value=0)
+ bvec_map[b] = b + '_vec'
+ init_net.CreateTensorVector([], [bvec_map[b]])
+
+ reader_net = core.Net('reader_net')
+ for b in blobs:
+ reader_net.Add([b, ONE], [b])
+
+ collect_net = core.Net('collect_net')
+ num_to_collect = 1000
+ max_example_to_cover = 100000
+ for i, b in enumerate(blobs):
+ if i == 0:
+ bvec_map[b], position = collect_net.CollectTensor(
+ [bvec_map[b], b], [bvec_map[b], 'position'],
+ num_to_collect=num_to_collect)
+ else:
+ # sample in the same way as the first blob
+ bvec_map[b], position = collect_net.CollectTensor(
+ [bvec_map[b], b, position], [bvec_map[b], position],
+ num_to_collect=num_to_collect)
+
+ print('Collect Net Proto: {}'.format(collect_net.Proto()))
+
+ plan = core.Plan('collect_data')
+ plan.AddStep(core.execution_step('collect_init', init_net))
+ plan.AddStep(core.execution_step('collect_data',
+ [reader_net, collect_net],
+ num_iter=max_example_to_cover))
+ workspace.RunPlan(plan)
+
+ # concat the collected tensors
+ concat_net = core.Net('concat_net')
+ bconcated_map = {}
+ for b in blobs:
+ bconcated_map[b] = b + '_concated'
+ concat_net.ConcatTensorVector([bvec_map[b]], [bconcated_map[b]])
+
+ workspace.RunNetOnce(concat_net)
+
+ # check data
+ reference_result = workspace.FetchBlob(bconcated_map[blobs[0]])
+ self.assertEqual(reference_result.shape,
+ (min(num_to_collect, max_example_to_cover), 2))
+
+ hist, _ = np.histogram(reference_result[:, 0], bins=10,
+ range=(1, max_example_to_cover))
+ print('Sample histogram: {}'.format(hist))
+
+ self.assertTrue(all(hist > 0.7 * (num_to_collect / 10)))
+ for i in range(1, len(blobs)):
+ result = workspace.FetchBlob(bconcated_map[blobs[i]])
+ self.assertEqual(reference_result.tolist(), result.tolist())
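
The histogram assertion above presumes CollectTensor keeps a roughly uniform sample of the stream. For readers unfamiliar with the idea, here is a plain-numpy sketch of reservoir sampling, the scheme this resembles (illustrative only, not the operator's implementation):

import numpy as np

def reservoir_sample(stream, k, rng=np.random):
    # Keep a uniform random sample of size k over a stream of unknown length.
    reservoir = []
    for i, item in enumerate(stream):
        if i < k:
            reservoir.append(item)
        else:
            j = rng.randint(0, i + 1)  # inclusive of 0, exclusive of i + 1
            if j < k:
                reservoir[j] = item
    return reservoir

sample = reservoir_sample(range(100000), 1000)
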
diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py
new file mode 100644
index 0000000..7183bcf
--- /dev/null
+++ b/caffe2/python/operator_test/gather_ranges_op_test.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+from hypothesis import given
+from hypothesis import strategies as st
+
+import caffe2.python.hypothesis_test_util as hu
+import numpy as np
+
+
+def batched_boarders_and_data(
+ data_min_size=5, data_max_size=10,
+ examples_min_number=1, examples_max_number=4,
+ example_min_size=1, example_max_size=3,
+ dtype=np.float32, elements=None):
+ dims_ = st.tuples(
+ st.integers(min_value=data_min_size,
+ max_value=data_max_size),
+ st.integers(min_value=examples_min_number,
+ max_value=examples_max_number),
+ st.integers(min_value=example_min_size,
+ max_value=example_max_size),
+ )
+ return dims_.flatmap(
+ lambda dims: st.tuples(
+ hu.arrays(
+ [dims[1], dims[2], 2], dtype=np.int32,
+ elements=st.integers(min_value=0, max_value=dims[0])
+ ),
+ hu.arrays([dims[0]], dtype, elements)
+ ))
+
+
+def gather_ranges(data, ranges):
+ lengths = []
+ output = []
+ for example_ranges in ranges:
+ length = 0
+ for range in example_ranges:
+ assert len(range) == 2
+ output.extend(data[range[0]:range[0] + range[1]])
+ length += range[1]
+ lengths.append(length)
+ return output, lengths
+
+
+class TestGatherRanges(hu.HypothesisTestCase):
+ @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only)
+ def test_gather_ranges(self, boarders_and_data, gc, dc):
+ boarders, data = boarders_and_data
+
+ def boarders_to_range(boarders):
+ assert len(boarders) == 2
+ boarders = sorted(boarders)
+ return [boarders[0], boarders[1] - boarders[0]]
+
+ ranges = np.apply_along_axis(boarders_to_range, 2, boarders)
+
+ self.assertReferenceChecks(
+ device_option=gc,
+ op=core.CreateOperator("GatherRanges",
+ ["data", "ranges"],
+ ["output", "lengths"]),
+ inputs=[data, ranges],
+ reference=gather_ranges,
+ )
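
A tiny worked case of the reference semantics tested here, using the gather_ranges helper defined above; ranges are (start, length) pairs per example and the numbers are made up.

import numpy as np

data = np.arange(10, dtype=np.float32)
ranges = np.array([[[2, 3], [7, 1]],   # example 0: data[2:5] and data[7:8]
                   [[0, 2], [5, 0]]],  # example 1: data[0:2] and an empty range
                  dtype=np.int32)
output, lengths = gather_ranges(data, ranges)
# output  == [2.0, 3.0, 4.0, 7.0, 0.0, 1.0]
# lengths == [4, 2]
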
diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py
index 5d27339..b656d21 100644
--- a/caffe2/python/operator_test/matmul_op_test.py
+++ b/caffe2/python/operator_test/matmul_op_test.py
@@ -46,3 +46,42 @@
self.assertGradientChecks(gc, op, [X, Y], 0, [0])
# Gradient check wrt Y
self.assertGradientChecks(gc, op, [X, Y], 1, [0])
+
+class TestBatchMatMul(hu.HypothesisTestCase):
+ @given(C=st.integers(min_value=1, max_value=10),
+ M=st.integers(min_value=1, max_value=10),
+ K=st.integers(min_value=1, max_value=10),
+ N=st.integers(min_value=1, max_value=10),
+ trans_a=st.booleans(),
+ trans_b=st.booleans(),
+ **hu.gcs)
+ def test_matmul(self, C, M, K, N, trans_a, trans_b, gc, dc):
+ X = np.random.randn(C, M, K).astype(np.float32)
+ if trans_a:
+ X = X.swapaxes(1, 2)
+
+ Y = np.random.randn(C, K, N).astype(np.float32)
+ if trans_b:
+ Y = Y.swapaxes(1, 2)
+
+ op = core.CreateOperator(
+ 'BatchMatMul', ['X', 'Y'], 'out',
+ trans_a=trans_a, trans_b=trans_b)
+
+ def matmul_ref(X, Y, trans_a, trans_b):
+ XX = X.swapaxes(1, 2) if trans_a else X
+ YY = Y.swapaxes(1, 2) if trans_b else Y
+ output = np.zeros((C, M, N)).astype(XX.dtype)
+ for i in range(C):
+ output[i] = XX[i].dot(YY[i])
+ return (output,)
+
+ # Check against numpy reference
+ self.assertReferenceChecks(gc, op, [X, Y, trans_a, trans_b],
+ matmul_ref)
+ # Check over multiple devices
+ self.assertDeviceChecks(dc, op, [X, Y], [0])
+ # Gradient check wrt X
+ self.assertGradientChecks(gc, op, [X, Y], 0, [0])
+ # Gradient check wrt Y
+ self.assertGradientChecks(gc, op, [X, Y], 1, [0])
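
A plain-numpy note on the layouts the trans_a/trans_b handling above produces; the shapes are arbitrary examples.

import numpy as np

C, M, K, N = 2, 3, 4, 5
X = np.random.randn(C, M, K).astype(np.float32).swapaxes(1, 2)  # fed as (C, K, M) when trans_a=True
Y = np.random.randn(C, K, N).astype(np.float32)
# The op is expected to transpose each batch slice back before multiplying:
out = np.stack([X[i].T.dot(Y[i]) for i in range(C)])  # (C, M, N)
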
diff --git a/caffe2/python/operator_test/mkl_ops_test.py b/caffe2/python/operator_test/mkl_ops_test.py
new file mode 100644
index 0000000..4d34559
--- /dev/null
+++ b/caffe2/python/operator_test/mkl_ops_test.py
@@ -0,0 +1,64 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import unittest
+import hypothesis.strategies as st
+from hypothesis import given
+import numpy as np
+from caffe2.python import core
+import caffe2.python.hypothesis_test_util as hu
+
+
+@unittest.skipIf(not core.IsOperator("PackedFC"),
+ "PackedFC is not supported in this caffe2 build.")
+class PackedFCTest(hu.HypothesisTestCase):
+ @given(seed=st.integers(0, 65536),
+ M=st.integers(16, 32),
+ K=st.integers(128, 1024),
+ N=st.integers(128, 1024),
+ **hu.gcs_cpu_only)
+ def test_packed_fc(self, seed, M, K, N, gc, dc):
+ np.random.seed(seed)
+ X = np.random.rand(M, K).astype(np.float32) - 0.5
+ W = np.random.rand(N, K).astype(np.float32) - 0.5
+ b = np.random.rand(N).astype(np.float32) - 0.5
+
+ # If you are debugging, the following hard-coded ones might help.
+ # X = np.ones((24, 256)).astype(np.float32)
+ # W = np.ones((128, 256)).astype(np.float32)
+ # b = np.zeros(128).astype(np.float32)
+
+ def ref(X, W, b):
+ return (np.dot(X, W.T) + b,)
+
+ for name in ["FC", "PackedFC"]:
+ op = core.CreateOperator(
+ name,
+ ["X", "W", "b"],
+ ["Y"],
+ )
+ self.assertReferenceChecks(gc, op, [X, W, b], ref)
+
+ @given(axis=st.integers(min_value=1, max_value=4),
+ num_output=st.integers(min_value=4, max_value=8),
+ **hu.gcs_cpu_only)
+ def test_packed_fc_axis(self, axis, num_output, gc, dc):
+ np.random.seed(1701)
+ X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
+ K = np.prod(X.shape[axis:])
+ N = num_output
+ W = np.random.randn(N, K).astype(np.float32)
+ b = np.random.randn(N).astype(np.float32)
+
+ op = core.CreateOperator(
+ "PackedFC",
+ ["X", "W", "b"],
+ ["Y"],
+ axis=axis)
+
+ def ref(X, W, b):
+            # Use floor division: under __future__ division, X.size / K is a float.
+            return (np.dot(X.reshape(X.size // K, K), W.T) + b,)
+
+ self.assertReferenceChecks(gc, op, [X, W, b], ref)
diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py
index edcd0c8..96bc668 100644
--- a/caffe2/python/operator_test/mpi_test.py
+++ b/caffe2/python/operator_test/mpi_test.py
@@ -12,10 +12,7 @@
from caffe2.python import core, workspace, dyndep
import caffe2.python.hypothesis_test_util as hu
-if workspace.has_gpu_support:
- dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops_gpu")
-else:
- dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops")
+dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops")
try:
from mpi4py import MPI
diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py
index a06c990..e43c3b1 100644
--- a/caffe2/python/operator_test/partition_ops_test.py
+++ b/caffe2/python/operator_test/partition_ops_test.py
@@ -8,6 +8,7 @@
class TestPartitionOps(TestCase):
+
def test_configs(self):
# (main dims, partitions, main type, [list of (extra dims, type)])
configs = [
@@ -31,15 +32,15 @@
for pack in [False, True]
]
- def testSharding(self):
+ def testPartition(self):
for main_dims, parts, main_type, extra_ins, pack in self.test_configs():
ins = ['in' + str(i) for i in range(1 + len(extra_ins))]
outs = [
'in{}_p{}'.format(i, j)
- for i in range(1 + len(extra_ins)) for j in range(parts)
+ for i in range(parts) for j in range(1 + len(extra_ins))
]
op = core.CreateOperator(
- 'Sharding', ins, outs, pack_first_input=(1 if pack else 0))
+ 'Partition', ins, outs, pack_first_input=(1 if pack else 0))
x = []
for i, (dims, t) in enumerate([((), main_type)] + extra_ins):
if t in [np.float32, np.float64]:
@@ -54,21 +55,102 @@
# numpy has proper modulo op that yields non-negative results
shards = (x[0] % parts).reshape([-1])
out = []
- for ind, v in enumerate(x):
- suffix_shape = v.shape[len(x[0].shape):]
- accum = [[] for i in range(parts)]
- a = v.reshape((-1, ) + suffix_shape)
- if pack and ind == 0:
- a //= parts
- for i, s in enumerate(shards):
- accum[s].append(a[i])
+ for i in range(parts):
+ for ind, v in enumerate(x):
+ suffix_shape = v.shape[len(x[0].shape):]
+ accum = []
+ data = v.reshape((-1, ) + suffix_shape)
- def join(a):
- if not a:
- return np.empty(shape=(0, ) + suffix_shape)
- return np.stack(a)
+ if pack and ind == 0:
+ data = data // parts
- out.extend(join(a) for a in accum)
+ for j, s in enumerate(shards):
+ if s == i:
+ accum.append(data[j])
+
+ def join(a):
+ if not a:
+ return np.empty(shape=(0, ) + suffix_shape)
+ return np.stack(a)
+
+ out.append(join(accum))
+ return out
+
+ workspace.RunOperatorOnce(op)
+ ref = sharding(x)
+ print(x)
+ print(ref)
+ for name, expected in zip(outs, ref):
+ np.testing.assert_array_equal(
+ expected, workspace.FetchBlob(name)
+ )
+
+ def testLengthsPartition(self):
+ for main_dims, parts, main_type, extra_ins, pack in self.test_configs():
+            # For LengthsPartition, only 1-D tensors are supported as the first input
+ if len(main_dims) > 1:
+ continue
+ ins = ['in' + str(i) for i in range(2 + len(extra_ins))]
+ outs = [
+ 'in{}_p{}'.format(j, i)
+ for i in range(parts) for j in range(2 + len(extra_ins))
+ ]
+ op = core.CreateOperator(
+ 'LengthsPartition', ins, outs,
+ pack_first_input=(1 if pack else 0)
+ )
+ x = []
+ for i, (dims, t) in enumerate([((), main_type)] + extra_ins):
+ if t in [np.float32, np.float64]:
+ d = rand_array(*(main_dims + dims))
+ else:
+ d = np.random.randint(-100, 100, (main_dims + dims))
+ d = d.astype(t)
+ workspace.FeedBlob(ins[i + 1], d)
+ x.append(d)
+
+ # Randomly generate length tensor as well
+ elements = np.random.randint(2, 10)
+ lengths = []
+ total_length = 0
+ for i in range(elements - 1):
+ lengths.append(np.random.randint(main_dims[0] - total_length))
+ total_length += lengths[-1]
+ lengths.append(main_dims[0] - total_length)
+ workspace.FeedBlob(ins[0], np.array(lengths, dtype=np.int32))
+
+ def sharding(x):
+ # numpy has proper modulo op that yields non-negative results
+ shards = (x[0] % parts).reshape([-1])
+ out = []
+ for i in range(parts):
+ idx = 0
+ sharded_lengths = np.zeros(elements)
+ for ind, length in enumerate(lengths):
+ for j in range(length):
+ if shards[idx] == i:
+ sharded_lengths[ind] += 1
+ idx += 1
+ out.append(sharded_lengths)
+
+ for ind, v in enumerate(x):
+ suffix_shape = v.shape[len(x[0].shape):]
+ accum = []
+ data = v.reshape((-1, ) + suffix_shape)
+
+ if pack and ind == 0:
+ data = data // parts
+
+ for j, s in enumerate(shards):
+ if s == i:
+ accum.append(data[j])
+
+ def join(a):
+ if not a:
+ return np.empty(shape=(0, ) + suffix_shape)
+ return np.stack(a)
+
+ out.append(join(accum))
return out
workspace.RunOperatorOnce(op)
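
A concrete picture of the per-shard grouping the rewritten reference builds (all outputs for shard 0 first, then shard 1, and so on), in plain numpy with made-up values:

import numpy as np

parts = 2
keys = np.array([3, 4, 7, 0], dtype=np.int64)
vals = np.array([30., 40., 70., 0.], dtype=np.float32)
shards = keys % parts
partitioned = [(keys[shards == i], vals[shards == i]) for i in range(parts)]
# partitioned[0] == (array([4, 0]), array([40., 0.], dtype=float32))
# partitioned[1] == (array([3, 7]), array([30., 70.], dtype=float32))
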
diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py
index bf31a66..49d8e18 100644
--- a/caffe2/python/operator_test/reshape_ops_test.py
+++ b/caffe2/python/operator_test/reshape_ops_test.py
@@ -10,10 +10,10 @@
class TestLengthsToShapeOps(TestCase):
def test_lengths_to_shape_ops(self):
- workspace.FeedBlob('l', np.array([200, 200, 200], dtype=np.int64))
+ workspace.FeedBlob('l', np.array([200, 200, 200], dtype=np.int32))
workspace.RunOperatorOnce(core.CreateOperator(
'LengthsToShape', ['l'], ['s']))
- workspace.FeedBlob('res', np.array([3, 200]))
+ workspace.FeedBlob('res', np.array([3, 200], dtype=np.int32))
assert ((workspace.FetchBlob('s') == workspace.FetchBlob('res')).all())
def test_reshape_ops(self):
diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py
index d98c0bb..2a11949 100644
--- a/caffe2/python/operator_test/segment_ops_test.py
+++ b/caffe2/python/operator_test/segment_ops_test.py
@@ -5,70 +5,164 @@
from caffe2.python import core
from functools import partial
from hypothesis import given
+
import caffe2.python.hypothesis_test_util as hu
import numpy as np
-def split(data, segment_ids, indices=None):
- """
- Given:
- data[M1 x M2 x ... x Md]
- the input data
- indices[N] the index of each entry of segment_ids into data,
- where 0 <= index[i] < M1,
- with default indices=[0,1,...N]
- segment_ids[N] the segment_id for each entry of indices,
+class TesterBase:
+ def segment_reduce_op(self, data, segment_ids, reducer, indices=None):
+ segments = self.split(data, segment_ids, indices)
+ output = np.zeros((len(segments), ) + data.shape[1:])
+ for i, segment in enumerate(segments):
+ output[i] = reducer(segment)
+ return output
- returns K outputs, each one containing data entries corresponding
- to one of the segments present in `segment_ids`.
- """
- K = max(segment_ids) + 1
- outputs = [
- np.zeros(
- (np.count_nonzero(segment_ids == seg_id),) + data.shape[1:],
- dtype=data.dtype)
- for seg_id in range(0, K)]
- counts = np.zeros(K)
- for i, seg_id in enumerate(segment_ids):
- data_idx = i if indices is None else indices[i]
- outputs[seg_id][counts[seg_id]] = data[data_idx]
- counts[seg_id] += 1
- return outputs
+ def segment_reduce_grad_op(
+ self,
+ data,
+ segment_ids,
+ reducer_grad,
+ grad_out,
+ output,
+ indices=None
+ ):
+ segments = self.split(data, segment_ids, indices)
+ segment_grads = [
+ reducer_grad(grad_out[i], [output[i]], [segment])
+ for i, segment in enumerate(segments)
+ ]
+ return self.unsplit(data.shape[1:], segment_grads, segment_ids)
+
+ def test(self, prefix, input_strategy, refs):
+ tester = self
+
+ @given(X=input_strategy, **hu.gcs_cpu_only)
+ def test_segment_ops(self, X, gc, dc):
+ for op_name, ref, grad_ref in refs:
+ inputs = ['input%d' % i for i in range(0, len(X))]
+ op = core.CreateOperator(prefix + op_name, inputs, ['output'])
+
+ def seg_reduce(data, *args):
+ indices, segments = (
+ args if len(args) == 2 else (None, args[0])
+ )
+ out = tester.segment_reduce_op(
+ data=data,
+ segment_ids=segments,
+ indices=indices,
+ reducer=ref
+ )
+ return (out, )
+
+ def seg_reduce_grad(grad_out, outputs, inputs):
+ data = inputs[0]
+ args = inputs[1:]
+ indices, segments = (
+ args if len(args) == 2 else (None, args[0])
+ )
+            # grad w.r.t. data
+ grad_val = tester.segment_reduce_grad_op(
+ data, segments, grad_ref, grad_out, outputs[0], indices
+ )
+ # if sparse, include indices along with data gradient
+ data_grad_slice = (
+ (grad_val, indices) if indices is not None else grad_val
+ )
+ # other inputs don't have gradient
+ return (data_grad_slice, ) + (None, ) * (len(inputs) - 1)
+
+ self.assertReferenceChecks(
+ device_option=gc,
+ op=op,
+ inputs=X,
+ reference=seg_reduce,
+ output_to_grad='output',
+ grad_reference=seg_reduce_grad,
+ )
+
+ return test_segment_ops
-def unsplit(inputs, segment_ids):
- """ Inverse operation to `split`, with indices=None """
- output = np.zeros((len(segment_ids),) + inputs[0].shape[1:])
- K = max(segment_ids) + 1
- counts = np.zeros(K)
- for i, seg_id in enumerate(segment_ids):
- output[i] = inputs[seg_id][counts[seg_id]]
- counts[seg_id] += 1
- return output
+class SegmentsTester(TesterBase):
+ def split(self, data, segment_ids, indices=None):
+ """
+ Given:
+ data[M1 x M2 x ... x Md]
+ the input data
+ indices[N] the index of each entry of segment_ids into data,
+ where 0 <= index[i] < M1,
+ with default indices=[0,1,...N]
+ segment_ids[N] the segment_id for each entry of indices,
+
+ returns K outputs, each one containing data entries corresponding
+ to one of the segments present in `segment_ids`.
+ """
+ if segment_ids.size == 0:
+ return []
+ K = max(segment_ids) + 1
+ outputs = [
+ np.zeros(
+ (np.count_nonzero(segment_ids == seg_id), ) + data.shape[1:],
+ dtype=data.dtype
+ ) for seg_id in range(0, K)
+ ]
+ counts = np.zeros(K)
+ for i, seg_id in enumerate(segment_ids):
+ data_idx = i if indices is None else indices[i]
+ outputs[seg_id][counts[seg_id]] = data[data_idx]
+ counts[seg_id] += 1
+ return outputs
+
+ def unsplit(self, extra_shape, inputs, segment_ids):
+ """ Inverse operation to `split`, with indices=None """
+ output = np.zeros((len(segment_ids), ) + extra_shape)
+ if len(segment_ids) == 0:
+ return output
+ K = max(segment_ids) + 1
+ counts = np.zeros(K)
+ for i, seg_id in enumerate(segment_ids):
+ output[i] = inputs[seg_id][counts[seg_id]]
+ counts[seg_id] += 1
+ return output
-def segment_reduce_op(data, segment_ids, reducer, indices=None):
- segments = split(data, segment_ids, indices)
- output = np.zeros((len(segments),) + data.shape[1:])
- for i, segment in enumerate(segments):
- output[i] = reducer(segment)
- return output
+class LengthsTester(TesterBase):
+ def split(self, data, lengths, indices=None):
+ K = len(lengths)
+ outputs = [
+ np.zeros((lengths[seg_id], ) + data.shape[1:],
+ dtype=data.dtype) for seg_id in range(0, K)
+ ]
+ start = 0
+ for i in range(0, K):
+ for j in range(0, lengths[i]):
+ data_index = start + j
+ if indices is not None:
+ data_index = indices[data_index]
+ outputs[i][j] = data[data_index]
+ start += lengths[i]
+ return outputs
-
-def segment_reduce_grad_op(data, segment_ids, reducer_grad,
- grad_out, output, indices=None):
- segments = split(data, segment_ids, indices)
- segment_grads = [
- reducer_grad(grad_out[i], [output[i]], [segment])
- for i, segment in enumerate(segments)]
- return unsplit(segment_grads, segment_ids)
+ def unsplit(self, extra_shape, inputs, lengths):
+ N = sum(lengths)
+ output = np.zeros((N, ) + extra_shape)
+ K = len(lengths)
+ assert len(inputs) == K
+ current = 0
+ for i in range(0, K):
+ for j in range(0, lengths[i]):
+ output[current] = inputs[i][j]
+ current += 1
+ return output
def sum_grad(grad_out, outputs, inputs):
return np.repeat(
np.expand_dims(grad_out, axis=0),
inputs[0].shape[0],
- axis=0)
+ axis=0
+ )
def logsumexp(x):
@@ -80,7 +174,8 @@
return np.repeat(
np.expand_dims(grad_out / sum_exps, 0),
inputs[0].shape[0],
- axis=0) * np.exp(inputs[0])
+ axis=0
+ ) * np.exp(inputs[0])
def logmeanexp(x):
@@ -95,10 +190,11 @@
return np.repeat(
np.expand_dims(grad_out / inputs[0].shape[0], 0),
inputs[0].shape[0],
- axis=0)
+ axis=0
+ )
-def max(x):
+def max_fwd(x):
return np.amax(x, axis=0)
@@ -122,9 +218,7 @@
return np.resize(flat_grad_in, inputs[0].shape)
-REFERENCES_ALL = [
- ('Sum', partial(np.sum, axis=0), sum_grad),
-]
+REFERENCES_ALL = [('Sum', partial(np.sum, axis=0), sum_grad), ]
REFERENCES_SORTED = [
('RangeSum', partial(np.sum, axis=0), sum_grad),
@@ -132,75 +226,75 @@
# gradient is the same as sum
('RangeLogMeanExp', logmeanexp, logsumexp_grad),
('RangeMean', mean, mean_grad),
- ('RangeMax', max, max_grad),
+ ('RangeMax', max_fwd, max_grad),
]
-def test(prefix, input_strategy, refs):
- @given(X=input_strategy, **hu.gcs_cpu_only)
- def test_segment_ops(self, X, gc, dc):
- for op_name, ref, grad_ref in refs:
- inputs = ['input%d' % i for i in range(0, len(X))]
- op = core.CreateOperator(prefix + op_name, inputs, ['output'])
-
- def seg_reduce(data, *args):
- indices, segment_ids = (
- args if len(args) == 2 else (None, args[0]))
- out = segment_reduce_op(
- data=data,
- segment_ids=segment_ids,
- indices=indices,
- reducer=ref)
- return (out,)
-
- def seg_reduce_grad(grad_out, outputs, inputs):
- data = inputs[0]
- args = inputs[1:]
- indices, segment_ids = (
- args if len(args) == 2 else (None, args[0]))
- # grad r.t. data
- grad_val = segment_reduce_grad_op(
- data, segment_ids, grad_ref,
- grad_out, outputs[0], indices)
- # if sparse, include indices along with data gradient
- data_grad_slice = (
- (grad_val, indices) if indices is not None else grad_val)
- # other inputs don't have gradient
- return (data_grad_slice,) + (None,) * (len(inputs) - 1)
-
- self.assertReferenceChecks(
- device_option=gc,
- op=op,
- inputs=X,
- reference=seg_reduce,
- output_to_grad='output',
- grad_reference=seg_reduce_grad,
- )
-
- return test_segment_ops
-
-
class TestSegmentOps(hu.HypothesisTestCase):
def test_sorted_segment_ops(self):
- test(
+ SegmentsTester().test(
'SortedSegment',
- hu.segmented_tensor(dtype=np.float32, is_sorted=True),
- REFERENCES_ALL + REFERENCES_SORTED)(self)
+ hu.segmented_tensor(
+ dtype=np.float32,
+ is_sorted=True,
+ allow_empty=True
+ ),
+ REFERENCES_ALL + REFERENCES_SORTED
+ )(self)
def test_unsorted_segment_ops(self):
- test(
+ SegmentsTester().test(
'UnsortedSegment',
- hu.segmented_tensor(dtype=np.float32, is_sorted=False),
- REFERENCES_ALL)(self)
+ hu.segmented_tensor(
+ dtype=np.float32,
+ is_sorted=False,
+ allow_empty=True
+ ),
+ REFERENCES_ALL
+ )(self)
def test_sparse_sorted_segment_ops(self):
- test(
+ SegmentsTester().test(
'SparseSortedSegment',
- hu.sparse_segmented_tensor(dtype=np.float32, is_sorted=True),
- REFERENCES_ALL)(self)
+ hu.sparse_segmented_tensor(
+ dtype=np.float32,
+ is_sorted=True,
+ allow_empty=True
+ ),
+ REFERENCES_ALL
+ )(self)
def test_sparse_unsorted_segment_ops(self):
- test(
+ SegmentsTester().test(
'SparseUnsortedSegment',
- hu.sparse_segmented_tensor(dtype=np.float32, is_sorted=False),
- REFERENCES_ALL)(self)
+ hu.sparse_segmented_tensor(
+ dtype=np.float32,
+ is_sorted=False,
+ allow_empty=True
+ ),
+ REFERENCES_ALL
+ )(self)
+
+ def test_lengths_ops(self):
+ LengthsTester().test(
+ 'Lengths',
+ hu.lengths_tensor(
+ dtype=np.float32,
+ min_value=1,
+ max_value=10,
+ allow_empty=True
+ ),
+ REFERENCES_ALL
+ )(self)
+
+ def test_sparse_lengths_ops(self):
+ LengthsTester().test(
+ 'SparseLengths',
+ hu.sparse_lengths_tensor(
+ dtype=np.float32,
+ min_value=1,
+ max_value=10,
+ allow_empty=True
+ ),
+ REFERENCES_ALL
+ )(self)
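
For a quick sense of what these reducers compute, a sorted-segment Sum worked out in plain numpy (values are arbitrary):

import numpy as np

data = np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]], dtype=np.float32)
segment_ids = np.array([0, 0, 1, 2], dtype=np.int32)
out = np.stack([data[segment_ids == s].sum(axis=0)
                for s in range(int(segment_ids.max()) + 1)])
# out == [[3., 3.], [3., 3.], [4., 4.]]
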
diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py
index 75af83d..f833ae6 100644
--- a/caffe2/python/operator_test/sequence_ops_test.py
+++ b/caffe2/python/operator_test/sequence_ops_test.py
@@ -15,7 +15,7 @@
def gen_with_size(args):
lengths, inner_shape = args
data_dim = [sum(lengths)] + inner_shape
- lengths = np.array(lengths, dtype=np.int64)
+ lengths = np.array(lengths, dtype=np.int32)
if with_pad_data:
return st.tuples(
st.just(lengths),
diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py
new file mode 100644
index 0000000..2a7036d
--- /dev/null
+++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py
@@ -0,0 +1,42 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+from scipy.sparse import coo_matrix
+
+from hypothesis import given
+import hypothesis.strategies as st
+
+from caffe2.python import core
+import caffe2.python.hypothesis_test_util as hu
+
+
+class TestSparseGradient(hu.HypothesisTestCase):
+ @given(M=st.integers(min_value=5, max_value=20),
+ N=st.integers(min_value=5, max_value=20),
+ K=st.integers(min_value=5, max_value=15),
+ sparsity=st.floats(min_value=0.1, max_value=1.0),
+ **hu.gcs)
+ def test_sparse_gradient(self, M, N, K, sparsity, gc, dc):
+ X = np.random.randn(M, K).astype(np.float32)
+ X[X > sparsity] = 0
+ X_coo = coo_matrix(X)
+ val, key, seg = X_coo.data, X_coo.col, X_coo.row
+
+ val = val.astype(np.float32)
+ key = key.astype(np.int64)
+ seg = seg.astype(np.int32)
+
+ Y = np.random.randn(K, N).astype(np.float32)
+
+ op = core.CreateOperator(
+ 'SparseUnsortedSegmentWeightedSum',
+ ['Y', 'val', 'key', 'seg'],
+ ['out'],
+ num_segments=M)
+
+ # Gradient check wrt Y
+ self.assertGradientChecks(
+ gc, op, [Y, val, key, seg], 0, [0])
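
As the test is set up, the operator applied to (Y, val, key, seg) taken from a COO decomposition of X should reproduce the dense product X.dot(Y) row by row; a plain-numpy sketch of that reduction (not the operator itself):

import numpy as np
from scipy.sparse import coo_matrix

M, K, N = 4, 3, 2
X = np.random.randn(M, K).astype(np.float32)
X[X > 0.5] = 0
Y = np.random.randn(K, N).astype(np.float32)
X_coo = coo_matrix(X)
out = np.zeros((M, N), dtype=np.float32)
for v, k, s in zip(X_coo.data, X_coo.col, X_coo.row):
    out[s] += v * Y[k]  # weighted gather of Y rows, summed per segment (= row of X)
assert np.allclose(out, X.dot(Y), atol=1e-4)
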
diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py
index 22548a0..1eaca27 100644
--- a/caffe2/python/operator_test/spatial_bn_op_test.py
+++ b/caffe2/python/operator_test/spatial_bn_op_test.py
@@ -93,9 +93,7 @@
seed=st.integers(0, 65535),
order=st.sampled_from(["NCHW", "NHWC"]),
epsilon=st.floats(1e-5, 1e-2),
- **hu.gcs_gpu_only)
- @unittest.skipIf(not workspace.has_gpu_support,
- "SpatialBN gradient only implemented through gpu.")
+ **hu.gcs)
def test_spatialbn_train_mode_gradient_check(
self, size, input_channels, batch_size, seed, order, epsilon,
gc, dc):
@@ -107,7 +105,7 @@
is_test=False,
epsilon=epsilon,
)
- np.random.seed(1701)
+ np.random.seed(seed)
scale = np.random.rand(input_channels).astype(np.float32) + 0.5
bias = np.random.rand(input_channels).astype(np.float32) - 0.5
mean = np.random.randn(input_channels).astype(np.float32)
@@ -117,5 +115,6 @@
if order == "NHWC":
X = X.swapaxes(1, 2).swapaxes(2, 3)
- self.assertGradientChecks(gc, op, [X, scale, bias, mean, var],
- 0, [0])
+ for input_to_check in [0, 1, 2]: # dX, dScale, dBias
+ self.assertGradientChecks(gc, op, [X, scale, bias, mean, var],
+ input_to_check, [0])
diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py
new file mode 100644
index 0000000..25099a6
--- /dev/null
+++ b/caffe2/python/operator_test/square_root_divide_op_test.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core
+from functools import partial
+from hypothesis import given
+from hypothesis import strategies as st
+
+import caffe2.python.hypothesis_test_util as hu
+import math
+import numpy as np
+
+
+def _data_and_scale(
+ data_min_size=4, data_max_size=10,
+ examples_min_number=1, examples_max_number=4,
+ dtype=np.float32, elements=None):
+ dims_ = st.tuples(
+ st.integers(min_value=examples_min_number,
+ max_value=examples_max_number),
+ st.integers(min_value=data_min_size,
+ max_value=data_max_size),
+ )
+ return dims_.flatmap(
+ lambda dims: st.tuples(
+ hu.arrays([dims[0], dims[1]], dtype=dtype),
+ hu.arrays(
+ [dims[0]], np.int32,
+ st.integers(min_value=5, max_value=10),
+ )
+ )
+ )
+
+
+def divide_by_square_root(data, scale):
+ output = np.copy(data)
+ num_examples = len(scale)
+
+ assert num_examples == data.shape[0]
+ assert len(data.shape) == 2
+
+ for i in range(0, num_examples):
+ if scale[i] > 0:
+ output[i] = np.multiply(data[i], 1 / math.sqrt(scale[i]))
+
+ return (output, )
+
+
+def grad(output_grad, ref_outputs, inputs):
+ return (divide_by_square_root(output_grad, inputs[1])[0],
+ None)
+
+
+class TestSquareRootDivide(hu.HypothesisTestCase):
+ @given(data_and_scale=_data_and_scale(),
+ **hu.gcs_cpu_only)
+ def test_square_root_divide(self, data_and_scale, gc, dc):
+ self.assertReferenceChecks(
+ device_option=gc,
+ op=core.CreateOperator("SquareRootDivide",
+ ["data", "scale"],
+ ["output"]),
+ inputs=list(data_and_scale),
+ reference=partial(divide_by_square_root),
+ output_to_grad="output",
+ grad_reference=grad,
+ )
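
A tiny numeric check of the divide_by_square_root reference above, with hand-picked values:

import numpy as np

data = np.array([[2., 4.], [9., 3.]], dtype=np.float32)
scale = np.array([4, 9], dtype=np.int32)
out, = divide_by_square_root(data, scale)
# out == [[1., 2.], [3., 1.]]
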
diff --git a/caffe2/python/operator_test/squeeze_test.py b/caffe2/python/operator_test/squeeze_test.py
deleted file mode 100644
index c5566ac..0000000
--- a/caffe2/python/operator_test/squeeze_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-import numpy as np
-
-from caffe2.python import core, workspace
-from caffe2.python.test_util import TestCase
-
-
-class TestSqueezeOp(TestCase):
- def test_squeeze_all(self):
- # Testing that squeezing without dims works.
- # With dims is covered in hypothesis_test
- data = np.array([[[1]]], dtype=np.int32)
- workspace.FeedBlob('data', data)
- workspace.RunOperatorOnce(core.CreateOperator(
- 'Squeeze', ['data'], ['squeezed']))
- result = workspace.FetchBlob('squeezed')
- assert(np.array_equal(result, 1))
diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py
new file mode 100644
index 0000000..1f8004a
--- /dev/null
+++ b/caffe2/python/pipeline.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core, queue_util
+from caffe2.python.dataio import Reader, Writer
+
+
+def processor_step(
+ reader, writer, num_threads=1, processor=None, name='processor'):
+ """
+ Given a reader and a writer, couple them through a processor, running
+ across multiple threads.
+
+ Args:
+ reader: an instance of dataio.Reader
+        writer: an instance of dataio.Writer
+        num_threads: number of processing threads
+        processor: if provided, a function of the form:
+ (nets, out_record) = processor(record)
+ where `record` is a schema.Struct containing the input,
+ `nets` is the list of nets doing the transformation, and
+ `out_record` is a schema.Struct with transformed data;
+ name: Name to be given to nets and execution steps created.
+
+ Returns:
+ Execution step that runs all threads of the processor in parallel.
+ """
+ assert isinstance(reader, Reader)
+ assert isinstance(writer, Writer)
+ global_init_net = core.Net(name + '_producer_global_init')
+ global_exit_net = core.Net(name + '_producer_global_exit')
+
+ reader.setup_ex(global_init_net, global_exit_net)
+ writer.setup_ex(global_init_net, global_exit_net)
+
+ def default_processor(fields):
+ return [], fields
+
+ if processor is None:
+ processor = default_processor
+
+ steps = []
+ for thread_id in range(num_threads):
+ init_net = core.Net(name + "_init_net_%d" % thread_id)
+ exit_net = core.Net(name + "_exit_net_%d" % thread_id)
+
+ read_nets, status, rec = reader.read_record_ex(init_net, exit_net)
+ process_nets, rec = processor(rec)
+ write_nets, _ = writer.write_record_ex(rec, init_net, exit_net, status)
+
+ step = core.execution_step(
+ name + "_thread_%d" % thread_id, [
+ core.execution_step(name + "_init_step", init_net),
+ core.execution_step(
+ name + "_worker_step",
+ list(read_nets) + list(process_nets) + list(write_nets),
+ should_stop_blob=status
+ ), core.execution_step(name + "_exit_step", exit_net)
+ ]
+ )
+ steps.append(step)
+
+ return core.execution_step(
+ "sender_step", [
+ core.execution_step('init_step', global_init_net),
+ core.execution_step(
+ "sender_steps", steps, concurrent_substeps=True),
+ core.execution_step('finish_step', global_exit_net),
+ ]
+ )
+
+
+class LocalPipeline(object):
+ """
+ Create a data processing pipeline consisting of a sequence of
+ multi-threaded processors communicating through queues.
+ """
+ def __init__(self):
+ self.tasks = []
+ self.init_net = core.Net('worker_init')
+
+ def create_queue(self, capacity, schema):
+ """
+ Create a queue that will be used to communicate between processors.
+
+ Args:
+ capacity: max number of records in the queue
+ schema: a schema.Struct representing the schema of a record in
+ the queue.
+
+ Returns:
+ A QueueWrapper containing a queue.
+ """
+ return queue_util.QueueWrapper(self.init_net, capacity, schema)
+
+ def add_task(self, task):
+ """
+ Add a task to the pipeline.
+ This task will run in parallel to other tasks in the pipeline.
+ """
+ self.tasks.append(task)
+
+ def link(self, reader, writer, num_threads=1, processor=None):
+ """
+ Add a task that will read from `reader`, and write to `writer`.
+ See function `processor_step` above for description of the arguments.
+ """
+ self.add_task(processor_step(reader, writer, num_threads, processor))
+
+ def get_step(self):
+ """
+ Create and return a Caffe2 execution step that will run all the tasks
+ of this pipeline in parallel.
+ """
+ return core.execution_step('worker_step', [
+ core.execution_step('worker_init', self.init_net),
+ core.execution_step(
+ 'tasks_step', self.tasks, concurrent_substeps=True)
+ ])
+
+ def get_step_and_output(self):
+ """
+ Return a tuple (execution_step, output) to be used as one of the tasks
+ in a distributed pipeline.
+ """
+ output = self.init_net.ConstantFill([], value=0.0)
+ return self.get_step(), [output]
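
A rough sketch of how LocalPipeline is meant to be assembled; source_reader and sink_writer stand in for real dataio.Reader/Writer instances and are not defined in this diff.

import numpy as np
from caffe2.python import schema, workspace

pipeline = LocalPipeline()
record = schema.Struct(('value', schema.Scalar(dtype=np.float32)))
q = pipeline.create_queue(capacity=100, schema=record)
# source_reader / sink_writer are hypothetical dataio.Reader / dataio.Writer objects.
pipeline.link(source_reader, q.writer(), num_threads=2)
pipeline.link(q.reader(), sink_writer, num_threads=2)
workspace.RunPlan(pipeline.get_step())  # RunPlan accepts an ExecutionStep after this diff
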
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 68cb2ea..757d6bf 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -220,11 +220,19 @@
py::gil_scoped_release g;
CAFFE_ENFORCE(self->RunOperatorOnce(proto));
})
- .def("_run_plan", [](Workspace* self, py::bytes def) {
- caffe2::PlanDef proto;
- CAFFE_ENFORCE(proto.ParseFromString(def));
- py::gil_scoped_release g;
- CAFFE_ENFORCE(self->RunPlan(proto));
+ .def(
+ "_run_plan",
+ [](Workspace* self, py::bytes def) {
+ caffe2::PlanDef proto;
+ CAFFE_ENFORCE(proto.ParseFromString(def));
+ py::gil_scoped_release g;
+ CAFFE_ENFORCE(self->RunPlan(proto));
+ })
+ .def_property_readonly_static("current", [](py::object /* type */) {
+ auto ws = gWorkspaces.find(gCurrentWorkspaceName);
+ CAFFE_ENFORCE(ws != gWorkspaces.end());
+ CAFFE_ENFORCE(ws->second.get());
+ return py::cast(ws->second.get(), py::return_value_policy::reference);
});
// Gradients
@@ -369,6 +377,7 @@
},
"Reset the workspace",
py::arg("root_folder") = py::none());
+
m.def("root_folder", []() {
CAFFE_ENFORCE(gWorkspace);
return gWorkspace->RootFolder();
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index 8fb1e91..7f60efb 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -65,7 +65,7 @@
pybind11::object Fetch(const Blob& blob) override {
const Tensor<Context>& tensor = blob.Get<Tensor<Context>>();
Context context;
- CHECK_GE(tensor.size(), 0);
+    CAFFE_ENFORCE_GE(tensor.size(), 0, "Trying to fetch uninitialized tensor");
std::vector<npy_intp> npy_dims;
for (const auto dim : tensor.dims()) {
npy_dims.push_back(dim);
diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py
index 1998f97..a703358 100644
--- a/caffe2/python/queue_util.py
+++ b/caffe2/python/queue_util.py
@@ -3,7 +3,64 @@
from __future__ import print_function
from __future__ import unicode_literals
-from caffe2.python import core
+from caffe2.python import core, dataio
+
+
+class QueueReader(dataio.Reader):
+ def __init__(self, queue, num_blobs=None, schema=None):
+ dataio.Reader.__init__(self, schema)
+ assert schema is not None or num_blobs is not None, (
+ 'Either schema or num_blobs must be provided.')
+
+ self.queue = queue
+ self.num_blobs = num_blobs
+
+ if schema is not None:
+ schema_num_blobs = len(schema.field_names())
+ assert num_blobs is None or num_blobs == schema_num_blobs
+ self.num_blobs = schema_num_blobs
+
+ def setup_ex(self, init_net, exit_net):
+ exit_net.CloseBlobsQueue([self.queue], 0)
+
+ def read_ex(self, local_init_net, local_finish_net):
+ dequeue_net = core.Net('dequeue_net')
+ fields, status_blob = dequeue(dequeue_net, self.queue, self.num_blobs)
+ return [dequeue_net], status_blob, fields
+
+
+class QueueWriter(dataio.Writer):
+ def __init__(self, queue):
+ self.queue = queue
+
+ def setup_ex(self, init_net, exit_net):
+ exit_net.CloseBlobsQueue([self.queue], 0)
+
+ def write_ex(self, fields, local_init_net, local_finish_net, status):
+ enqueue_net = core.Net('enqueue_net')
+ enqueue(enqueue_net, self.queue, fields, status)
+ return [enqueue_net]
+
+
+class QueueWrapper(object):
+ def __init__(self, init_net, capacity, schema):
+ self._queue = init_net.CreateBlobsQueue(
+ [],
+ capacity=capacity,
+ num_blobs=len(schema.field_names()))
+ self._schema = schema
+
+ def reader(self):
+ return QueueReader(self._queue, schema=self._schema)
+
+ def writer(self):
+ return QueueWriter(self._queue)
+
+ def queue(self):
+ return self._queue
+
+ def schema(self):
+ return self._schema
def enqueue(net, queue, data_blobs, status=None):
diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py
index ab346a6..336e74a 100644
--- a/caffe2/python/schema.py
+++ b/caffe2/python/schema.py
@@ -37,6 +37,16 @@
return ''
+def _normalize_field(field_or_type_or_blob):
+ """Clones/normalizes a field before adding it to a container."""
+ if isinstance(field_or_type_or_blob, Field):
+ return field_or_type_or_blob.clone()
+ elif type(field_or_type_or_blob) in (type, np.dtype):
+ return Scalar(dtype=field_or_type_or_blob)
+ else:
+ return Scalar(blob=field_or_type_or_blob)
+
+
class Field(object):
"""Represents an abstract field type in a dataset.
"""
@@ -116,9 +126,8 @@
the parent domain.
"""
def __init__(self, values, lengths_blob=None):
- assert isinstance(values, Field)
self.lengths = Scalar(np.int32, lengths_blob)
- self._items = values.clone()
+ self._items = _normalize_field(values)
self.lengths._set_parent(self, 0)
self._items._set_parent(self, 1)
Field.__init__(self, [self.lengths, self._items])
@@ -160,8 +169,7 @@
assert field[0], 'Field names cannot be empty'
assert field[0] != 'lengths', (
'Struct cannot contain a field named `lengths`.')
- assert isinstance(field[1], Field)
- fields = [(name, field.clone()) for name, field in fields]
+ fields = [(name, _normalize_field(field)) for name, field in fields]
for id, (name, field) in enumerate(fields):
field._set_parent(self, id)
self.fields = OrderedDict(fields)
@@ -191,6 +199,16 @@
def clone(self):
return Struct(*self.fields.items())
+ def __getitem__(self, item):
+ if isinstance(item, list) or isinstance(item, tuple):
+ return Struct(*[(
+ self.fields.keys()[k] if isinstance(k, int) else k, self[k])
+ for k in item])
+ elif isinstance(item, int):
+ return self.fields.values()[item]
+ else:
+ return self.fields[item]
+
def __getattr__(self, item):
if item.startswith('__'):
raise AttributeError(item)
@@ -340,6 +358,23 @@
lengths_blob=lengths_blob)
+def Tuple(*fields):
+ """
+ Creates a Struct with default, sequential, field names of given types.
+ """
+ return Struct(*[
+ ('field_%d' % i, field) for i, field in enumerate(fields)])
+
+
+def RawTuple(num_fields):
+ """
+    Creates a tuple of `num_fields` untyped scalars.
+ """
+ assert isinstance(num_fields, int)
+ assert num_fields > 0
+ return Tuple(*([np.void] * num_fields))
+
+
def from_dtype(dtype, _outer_shape=()):
"""Constructs a Caffe2 schema from the given numpy's dtype.
diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py
index c9caf09..aea2c80 100644
--- a/caffe2/python/schema_test.py
+++ b/caffe2/python/schema_test.py
@@ -15,9 +15,69 @@
s = schema.Struct(
('field1', schema.Scalar(dtype=np.int32)),
('field2', schema.List(
- schema.Scalar(dtype=str))))
+ schema.Scalar(dtype=str)))
+ )
s2 = pickle.loads(pickle.dumps(s))
for r in (s, s2):
self.assertTrue(isinstance(r.field1, schema.Scalar))
self.assertTrue(isinstance(r.field2, schema.List))
self.assertTrue(getattr(r, 'non_existent', None) is None)
+
+ def testNormalizeField(self):
+ s = schema.Struct(('field1', np.int32), ('field2', str))
+ self.assertEquals(
+ s,
+ schema.Struct(
+ ('field1', schema.Scalar(dtype=np.int32)),
+ ('field2', schema.Scalar(dtype=str))
+ )
+ )
+
+ def testTuple(self):
+ s = schema.Tuple(np.int32, str, np.float32)
+ s2 = schema.Struct(
+ ('field_0', schema.Scalar(dtype=np.int32)),
+ ('field_1', schema.Scalar(dtype=np.str)),
+ ('field_2', schema.Scalar(dtype=np.float32))
+ )
+ self.assertEquals(s, s2)
+ self.assertEquals(s[0], schema.Scalar(dtype=np.int32))
+ self.assertEquals(s[1], schema.Scalar(dtype=np.str))
+ self.assertEquals(s[2], schema.Scalar(dtype=np.float32))
+ self.assertEquals(
+ s[2, 0],
+ schema.Struct(
+ ('field_2', schema.Scalar(dtype=np.float32)),
+ ('field_0', schema.Scalar(dtype=np.int32)),
+ )
+ )
+ # test iterator behavior
+ for i, (v1, v2) in enumerate(zip(s, s2)):
+ self.assertEquals(v1, v2)
+ self.assertEquals(s[i], v1)
+ self.assertEquals(s2[i], v1)
+
+ def testRawTuple(self):
+ s = schema.RawTuple(2)
+ self.assertEquals(
+ s,
+ schema.Struct(
+ ('field_0', schema.Scalar()),
+ ('field_1', schema.Scalar())))
+ self.assertEquals(s[0], schema.Scalar())
+ self.assertEquals(s[1], schema.Scalar())
+
+ def testStructIndexing(self):
+ s = schema.Struct(
+ ('field1', schema.Scalar(dtype=np.int32)),
+ ('field2', schema.List(schema.Scalar(dtype=str)))
+ )
+ self.assertEquals(s['field2'], s.field2)
+ self.assertEquals(s['field2'], schema.List(schema.Scalar(dtype=str)))
+ self.assertEquals(
+ s['field2', 'field1'],
+ schema.Struct(
+ ('field2', schema.List(schema.Scalar(dtype=str))),
+ ('field1', schema.Scalar(dtype=np.int32)),
+ )
+ )
diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py
index 18b3b42..497507c 100644
--- a/caffe2/python/scope.py
+++ b/caffe2/python/scope.py
@@ -21,6 +21,7 @@
_NAMESCOPE_SEPARATOR = '/'
+# NOTE: using NameScope is NOT thread-safe! (TODO t13621185)
@contextlib.contextmanager
def NameScope(prefix, reset=False):
global NAMESCOPE
diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py
index f055cfd..6e91b46 100644
--- a/caffe2/python/utils.py
+++ b/caffe2/python/utils.py
@@ -42,6 +42,9 @@
if isinstance(value, np.ndarray):
value = value.flatten().tolist()
+ elif isinstance(value, np.generic):
+ # convert numpy scalar to native python type
+ value = np.asscalar(value)
if type(value) is float:
argument.f = value
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index 8dc1814..1ce3d625 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -108,6 +108,9 @@
return obj.SerializeToString()
elif hasattr(obj, 'Proto'):
return obj.Proto().SerializeToString()
+ else:
+ raise ValueError("Unexpected argument to StringfyProto of type " +
+ type(obj).__name__)
def ResetWorkspace(root_folder=None):
@@ -155,8 +158,12 @@
return C.run_net(StringifyNetName(name))
-def RunPlan(plan):
- return C.run_plan(StringfyProto(plan))
+def RunPlan(plan_or_step):
+ # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps
+ import caffe2.python.core as core
+ if isinstance(plan_or_step, core.ExecutionStep):
+ plan_or_step = core.Plan(plan_or_step)
+ return C.run_plan(StringfyProto(plan_or_step))
def _StringifyName(name, expected_type):
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index cd5c117..b3afb7a 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -42,6 +42,15 @@
workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
self.assertEqual(workspace.HasBlob("testblob"), True)
+ def testCurrentWorkspaceWrapper(self):
+ self.assertNotIn("testblob", workspace.C.Workspace.current.blobs)
+ self.assertEqual(
+ workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
+ self.assertEqual(workspace.HasBlob("testblob"), True)
+ self.assertIn("testblob", workspace.C.Workspace.current.blobs)
+ workspace.ResetWorkspace()
+ self.assertNotIn("testblob", workspace.C.Workspace.current.blobs)
+
def testRunPlan(self):
plan = core.Plan("test-plan")
plan.AddStep(core.ExecutionStep("test-step", self.net))
@@ -49,6 +58,11 @@
workspace.RunPlan(plan.Proto().SerializeToString()), True)
self.assertEqual(workspace.HasBlob("testblob"), True)
+ def testConstructPlanFromSteps(self):
+ step = core.ExecutionStep("test-step-as-plan", self.net)
+ self.assertEqual(workspace.RunPlan(step), True)
+ self.assertEqual(workspace.HasBlob("testblob"), True)
+
def testResetWorkspace(self):
self.assertEqual(
workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
diff --git a/caffe2/queue/blobs_queue.h b/caffe2/queue/blobs_queue.h
index efd7f22..1bc6a07 100644
--- a/caffe2/queue/blobs_queue.h
+++ b/caffe2/queue/blobs_queue.h
@@ -74,29 +74,26 @@
return true;
}
- bool blockingWrite(const std::vector<Blob*>& inputs) {
+ bool tryWrite(const std::vector<Blob*>& inputs) {
auto keeper = this->shared_from_this();
std::unique_lock<std::mutex> g(mutex_);
- auto canWrite = [this]() {
- // writer is always within [reader, reader + size)
- // we can write if reader is within [reader, reader + size)
- CHECK_LE(reader_, writer_);
- CHECK_LE(writer_, reader_ + queue_.size());
- return writer_ != reader_ + queue_.size();
- };
- cv_.wait(g, [this, canWrite]() { return closing_ || canWrite(); });
if (!canWrite()) {
return false;
}
DCHECK(canWrite());
- auto& result = queue_[writer_ % queue_.size()];
- CAFFE_ENFORCE(inputs.size() >= result.size());
- for (auto i = 0; i < result.size(); ++i) {
- using std::swap;
- swap(*(inputs[i]), *(result[i]));
+ doWrite(inputs);
+ return true;
+ }
+
+ bool blockingWrite(const std::vector<Blob*>& inputs) {
+ auto keeper = this->shared_from_this();
+ std::unique_lock<std::mutex> g(mutex_);
+ cv_.wait(g, [this]() { return closing_ || canWrite(); });
+ if (!canWrite()) {
+ return false;
}
- ++writer_;
- cv_.notify_all();
+ DCHECK(canWrite());
+ doWrite(inputs);
return true;
}
@@ -112,6 +109,25 @@
}
private:
+ bool canWrite() {
+ // writer is always within [reader, reader + size)
+ // we can write if reader is within [reader, reader + size)
+ CHECK_LE(reader_, writer_);
+ CHECK_LE(writer_, reader_ + queue_.size());
+ return writer_ != reader_ + queue_.size();
+ }
+
+ void doWrite(const std::vector<Blob*>& inputs) {
+ auto& result = queue_[writer_ % queue_.size()];
+ CAFFE_ENFORCE(inputs.size() >= result.size());
+ for (auto i = 0; i < result.size(); ++i) {
+ using std::swap;
+ swap(*(inputs[i]), *(result[i]));
+ }
+ ++writer_;
+ cv_.notify_all();
+ }
+
std::atomic<bool> closing_{false};
size_t numBlobs_;
diff --git a/caffe2/queue/queue_ops.cc b/caffe2/queue/queue_ops.cc
index d938394..8b073ed 100644
--- a/caffe2/queue/queue_ops.cc
+++ b/caffe2/queue/queue_ops.cc
@@ -3,6 +3,8 @@
namespace caffe2 {
+CAFFE_KNOWN_TYPE(std::shared_ptr<BlobsQueue>);
+
namespace {
REGISTER_CPU_OPERATOR(CreateBlobsQueue, CreateBlobsQueueOp<CPUContext>);
diff --git a/caffe2/sgd/adagrad_op.h b/caffe2/sgd/adagrad_op.h
index d2917f4..47f5a54 100644
--- a/caffe2/sgd/adagrad_op.h
+++ b/caffe2/sgd/adagrad_op.h
@@ -19,7 +19,7 @@
for (auto i = 0; i < N; ++i) {
float gi = g[i];
float hi = nh[i] = h[i] + gi * gi;
- ng[i] = lr[0] * gi / (sqrt(hi) + epsilon);
+ ng[i] = lr[0] * gi / (std::sqrt(hi) + epsilon);
}
}
@@ -32,12 +32,12 @@
float* nw,
float* nh,
float epsilon,
- const float* lr,
+ float lr,
Context* context) {
for (auto i = 0; i < N; ++i) {
float gi = g[i];
float hi = nh[i] = h[i] + gi * gi;
- nw[i] = w[i] + lr[0] * gi / (sqrt(hi) + epsilon);
+ nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
}
}
@@ -61,7 +61,7 @@
Output(OUTPUT_PARAM)->template mutable_data<T>(),
Output(OUTPUT_MOMENT_1)->template mutable_data<T>(),
epsilon_,
- Input(LR).template data<T>(),
+ Input(LR).template data<T>()[0],
&context_);
return true;
}
@@ -92,7 +92,6 @@
Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
auto n = Input(GRAD).dim(0);
- auto block_size = Input(GRAD).size() / n;
const auto* indices = Input(INDICES).template data<SIndex>();
const auto* gradIn = Input(GRAD).template data<T>();
@@ -100,12 +99,18 @@
const auto* momentIn = Input(MOMENT_1).template data<T>();
auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<T>();
auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>();
+
+ if (n == 0) {
+ return true;
+ }
+
+ auto block_size = Input(GRAD).size_from_dim(1);
for (auto i = 0; i < n; ++i) {
auto idx = indices[i];
if (block_size == 1) {
float gi = gradIn[i];
float hi = momentOut[idx] = momentIn[idx] + gi * gi;
- paramOut[idx] = paramIn[idx] + lr[0] * gi / (sqrt(hi) + epsilon_);
+ paramOut[idx] = paramIn[idx] + lr[0] * gi / (std::sqrt(hi) + epsilon_);
} else {
auto offsetI = i * block_size;
auto offsetIdx = idx * block_size;
@@ -117,7 +122,7 @@
paramOut + offsetIdx,
momentOut + offsetIdx,
epsilon_,
- lr,
+ lr[0],
&context_);
}
}
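
For reference, the dense AdaGrad update above transcribed into numpy; this mirrors the element-wise loop and is not used by the operator itself.

import numpy as np

def adagrad_update(w, h, g, lr, epsilon):
    # Accumulate squared gradients, then scale the step per coordinate.
    nh = h + g * g
    nw = w + lr * g / (np.sqrt(nh) + epsilon)
    return nw, nh
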
diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu
index cd5fe5b..dc47127 100644
--- a/caffe2/sgd/adagrad_op_gpu.cu
+++ b/caffe2/sgd/adagrad_op_gpu.cu
@@ -15,7 +15,7 @@
CUDA_1D_KERNEL_LOOP(i, N) {
float gi = g[i];
float hi = nh[i] = h[i] + gi * gi;
- ng[i] = lr[0] * gi / (sqrt(hi) + epsilon);
+ ng[i] = lr[0] * gi / (std::sqrt(hi) + epsilon);
}
}
diff --git a/caffe2/sgd/ftrl_op.cc b/caffe2/sgd/ftrl_op.cc
index f11c934..81e34b5 100644
--- a/caffe2/sgd/ftrl_op.cc
+++ b/caffe2/sgd/ftrl_op.cc
@@ -18,13 +18,13 @@
T& nz,
const FtrlParams<T>& params) {
auto new_n = n + g * g;
- auto sigma = (sqrt(new_n) - sqrt(n)) / params.alpha;
+ auto sigma = (sqrt(new_n) - sqrt(n)) * params.alphaInv;
nn = new_n;
nz = z + g - sigma * w;
// update the weight
if (std::abs(nz) > params.lambda1) {
nw = (params.lambda1 * sgn(nz) - nz) /
- ((params.beta + sqrt(new_n)) / params.alpha + params.lambda2);
+ ((params.beta + sqrt(new_n)) * params.alphaInv + params.lambda2);
} else {
nw = 0.0;
}
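The FTRL change replaces two per-element divisions by alpha with multiplications by alphaInv, a reciprocal precomputed once in FtrlParams (shown in the next hunk). The isolated sketch below illustrates the transformation; the names are hypothetical, and the result can differ from the division form in the last floating-point bit.

#include <cmath>

// Hypothetical, simplified parameter struct: compute 1/alpha once at
// construction instead of dividing by alpha for every element.
struct FtrlParamsSketch {
  float alphaInv, beta, lambda1, lambda2;
  FtrlParamsSketch(float alpha, float beta_, float l1, float l2)
      : alphaInv(1.0f / alpha), beta(beta_), lambda1(l1), lambda2(l2) {}
};

inline float sigma_sketch(float n, float new_n, const FtrlParamsSketch& p) {
  // Before: (sqrt(new_n) - sqrt(n)) / alpha
  // After:  (sqrt(new_n) - sqrt(n)) * alphaInv
  return (std::sqrt(new_n) - std::sqrt(n)) * p.alphaInv;
}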
diff --git a/caffe2/sgd/ftrl_op.h b/caffe2/sgd/ftrl_op.h
index 1d93530..27a0b09 100644
--- a/caffe2/sgd/ftrl_op.h
+++ b/caffe2/sgd/ftrl_op.h
@@ -7,11 +7,11 @@
template <typename T>
struct FtrlParams {
explicit FtrlParams(OperatorBase* op)
- : alpha(op->GetSingleArgument<float>("alpha", 0.005)),
+ : alphaInv(1.0 / op->GetSingleArgument<float>("alpha", 0.005)),
beta(op->GetSingleArgument<float>("beta", 1.0)),
lambda1(op->GetSingleArgument<float>("lambda1", 0.001)),
lambda2(op->GetSingleArgument<float>("lambda2", 0.001)) {}
- T alpha;
+ T alphaInv;
T beta;
T lambda1;
T lambda2;
diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h
index 063bf24..13f025e 100644
--- a/caffe2/sgd/iter_op.h
+++ b/caffe2/sgd/iter_op.h
@@ -34,11 +34,12 @@
bool RunOnDevice() override {
if (InputSize() == 0) {
- LOG(ERROR) << "You are using an old definition of IterOp that will "
- "be deprecated soon. More specifically, IterOp now "
- "requires an explicit in-place input and output.";
if (!OperatorBase::OutputIsType<TensorCPU>(0)) {
// This is the first run; set the iter to start with 0.
+ LOG(ERROR) << "You are using an old definition of IterOp that will "
+ "be deprecated soon. More specifically, IterOp now "
+ "requires an explicit in-place input and output.";
+
auto* output = OperatorBase::Output<TensorCPU>(0);
VLOG(1) << "Initializing iter counter.";
output->Resize(1);
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index d28f498..caec15b 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -19,7 +19,7 @@
"base_lr", FLT_MAX)) {
CHECK_NE(base_lr_, FLT_MAX) << "Base learning rate must be set.";
const string policy = OperatorBase::GetSingleArgument<string>("policy", "");
- CHECK(policy.size()) << "Must specify a learning rate policy.";
+ CAFFE_ENFORCE(policy.size(), "Must specify a learning rate policy.");
if (policy == "fixed") {
functor_.reset(new FixedLearningRate<T>());
} else if (policy == "step") {
diff --git a/caffe2/utils/mkl_utils.h b/caffe2/utils/mkl_utils.h
new file mode 100644
index 0000000..446001f
--- /dev/null
+++ b/caffe2/utils/mkl_utils.h
@@ -0,0 +1,59 @@
+#ifndef CAFFE2_UTILS_MKL_UTILS_H_
+#define CAFFE2_UTILS_MKL_UTILS_H_
+#ifdef CAFFE2_USE_MKL
+
+#include <mkl.h>
+
+#include "caffe2/core/logging.h"
+
+#if INTEL_MKL_VERSION >= 20170000
+#define CAFFE2_HAS_MKL_SGEMM_PACK
+#define CAFFE2_HAS_MKL_DNN
+
+namespace caffe2 {
+namespace mkl {
+
+struct MKLPackedMatrix {
+ char identifier_;
+ char trans_;
+ int m_;
+ int n_;
+ int k_;
+ float alpha_;
+ int ld_;
+ float* data_ = nullptr;
+
+ MKLPackedMatrix(
+ const char identifier,
+ const char trans,
+ const int m,
+ const int n,
+ const int k,
+ const float alpha,
+ const float* src,
+ const int ld)
+ : identifier_(identifier),
+ trans_(trans),
+ m_(m),
+ n_(n),
+ k_(k),
+ alpha_(alpha),
+ ld_(ld) {
+ data_ = sgemm_alloc(&identifier, &m, &n, &k);
+ CAFFE_ENFORCE(data_, "MKL runtime error: cannot allocate sgemm memory.");
+ sgemm_pack(&identifier, &trans, &m, &n, &k, &alpha, src, &ld, data_);
+ }
+
+ ~MKLPackedMatrix() {
+ if (data_) {
+ sgemm_free(data_);
+ }
+ }
+};
+
+} // namespace mkl
+} // namespace caffe2
+
+#endif // INTEL_MKL_VERSION >= 20170000
+#endif // CAFFE2_USE_MKL
+#endif // CAFFE2_UTILS_MKL_UTILS_H_
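MKLPackedMatrix wraps MKL's packed-GEMM workflow (available since MKL 2017, hence the version guard): allocate a packed buffer with sgemm_alloc, fill it with sgemm_pack, reuse it across many multiplications, and release it with sgemm_free. The sketch below shows one possible way to consume the packed buffer with sgemm_compute, where the "P" code marks the pre-packed operand; it assumes the LP64 interface (MKL_INT == int) and column-major storage, so check the MKL packed-GEMM documentation before relying on the exact call.

#ifdef CAFFE2_HAS_MKL_SGEMM_PACK
#include "caffe2/utils/mkl_utils.h"

// Sketch: C(m x n) = A(m x k) * B(k x n), all column-major, with A packed
// once and reused. Packing pays off when the same A multiplies many Bs.
void packed_gemm_sketch(int m, int n, int k,
                        const float* A, const float* B, float* C) {
  const float beta = 0.0f;
  // Identifier 'A' = pack the A operand; 'N' = no transpose; lda = m.
  caffe2::mkl::MKLPackedMatrix packedA('A', 'N', m, n, k, 1.0f, A, m);
  // "P" tells MKL the first operand is the packed buffer created above.
  sgemm_compute("P", "N", &m, &n, &k,
                packedA.data_, &packedA.ld_,
                B, &k, &beta, C, &m);
}
#endif // CAFFE2_HAS_MKL_SGEMM_PACK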
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index a65cd7e..8c19dd3 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -111,7 +111,7 @@
void WriteProtoToTextFile(const Message& proto, const char* filename) {
int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
FileOutputStream* output = new FileOutputStream(fd);
- CHECK(google::protobuf::TextFormat::Print(proto, output));
+ CAFFE_ENFORCE(google::protobuf::TextFormat::Print(proto, output));
delete output;
close(fd);
}
@@ -138,7 +138,7 @@
std::unique_ptr<ZeroCopyOutputStream> raw_output(new FileOutputStream(fd));
std::unique_ptr<CodedOutputStream> coded_output(
new CodedOutputStream(raw_output.get()));
- CHECK(proto.SerializeToCodedStream(coded_output.get()));
+ CAFFE_ENFORCE(proto.SerializeToCodedStream(coded_output.get()));
coded_output.reset();
raw_output.reset();
close(fd);
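One behavioral nuance of these two hunks: CHECK aborts the process, while CAFFE_ENFORCE throws, so the manual delete/close statements after the enforce are skipped if printing or serialization fails. Below is a sketch of one way to keep cleanup on the throw path; it is not the patch's code, and only relies on standard protobuf API (FileOutputStream::SetCloseOnDelete).

#include <fcntl.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
#include "caffe2/core/logging.h"

// Sketch: scope the stream so it is flushed and destroyed even when the
// enforce throws, and let it own the descriptor via SetCloseOnDelete.
void WriteProtoToTextFileSketch(const google::protobuf::Message& proto,
                                const char* filename) {
  const int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
  CAFFE_ENFORCE(fd >= 0, "Cannot open file ", filename, " for writing.");
  google::protobuf::io::FileOutputStream output(fd);
  output.SetCloseOnDelete(true);  // fd is closed when output goes out of scope
  CAFFE_ENFORCE(google::protobuf::TextFormat::Print(proto, &output));
}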
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
index 8a355d0..2ce6223 100644
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@@ -73,7 +73,7 @@
using ::google::protobuf::Message;
inline string ProtoDebugString(const Message& proto) {
- return proto.DebugString();
+ return proto.ShortDebugString();
}
bool ReadProtoFromTextFile(const char* filename, Message* proto);
@@ -179,8 +179,9 @@
CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
MessageType message;
if (arg_map_.at(name)->has_s()) {
- CHECK(message.ParseFromString(arg_map_.at(name)->s()))
- << "Faild to parse content from the string";
+ CAFFE_ENFORCE(
+ message.ParseFromString(arg_map_.at(name)->s()),
+ "Faild to parse content from the string");
} else {
VLOG(1) << "Return empty message for parameter " << name;
}
@@ -192,8 +193,9 @@
CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
vector<MessageType> messages(arg_map_.at(name)->strings_size());
for (int i = 0; i < messages.size(); ++i) {
- CHECK(messages[i].ParseFromString(arg_map_.at(name)->strings(i)))
- << "Faild to parse content from the string";
+ CAFFE_ENFORCE(
+ messages[i].ParseFromString(arg_map_.at(name)->strings(i)),
+ "Faild to parse content from the string");
}
return messages;
}
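The ProtoDebugString change only affects formatting: ShortDebugString() renders the same text-format fields as DebugString() but on a single line, which keeps operator definitions readable in log output. Illustrative only; the field values in the comment are made up.

#include <string>
#include "caffe2/proto/caffe2.pb.h"

// DebugString():      type: "FC"\ninput: "X"\ninput: "W"\n...   (multi-line)
// ShortDebugString(): type: "FC" input: "X" input: "W" ...      (one line)
std::string DescribeOp(const caffe2::OperatorDef& op) {
  return op.ShortDebugString();
}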
diff --git a/caffe2/utils/string_utils.cc b/caffe2/utils/string_utils.cc
index 17afd2f..7d4d65a 100644
--- a/caffe2/utils/string_utils.cc
+++ b/caffe2/utils/string_utils.cc
@@ -17,112 +17,4 @@
return pieces;
}
-Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
- : escape_(escape) {
- reset();
- std::memset(delimTable_, 0, sizeof(delimTable_));
- for (int i = 0; i < delims.size(); ++i) {
- delimTable_[(unsigned char)delims.at(i)] = i + 1;
- }
-}
-
-void Tokenizer::reset() {
- toBeSkipped_ = 0;
- startDelimId_ = 0;
- leftover_.clear();
-}
-
-void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
- tokenized.modifiedStrings_.clear();
- tokenized.tokens_.clear();
-
- char* currentStart = start;
- std::string* copied = nullptr;
- if (!leftover_.empty()) {
- tokenized.modifiedStrings_.emplace_back(new std::string());
- copied = tokenized.modifiedStrings_.back().get();
- *copied = std::move(leftover_);
- }
-
- char* ch;
- for (ch = start + toBeSkipped_; ch < end; ++ch) {
- if (*ch == escape_) {
- if (!copied) {
- tokenized.modifiedStrings_.emplace_back(new std::string());
- copied = tokenized.modifiedStrings_.back().get();
- }
- copied->append(currentStart, ch);
- currentStart = ch + 1;
- // skip next character, since it's escaped
- ++ch;
- continue;
- }
- int newDelimId = delimTable_[(unsigned char)*ch];
- if (newDelimId > 0) {
- // found delimiter
- tokenized.tokens_.emplace_back();
- auto& token = tokenized.tokens_.back();
- token.startDelimId = startDelimId_;
- if (copied) {
- copied->append(currentStart, ch);
- const char* c_str = copied->data();
- token.start = c_str;
- token.end = c_str + copied->size();
- } else {
- token.start = currentStart;
- token.end = ch;
- }
- currentStart = ch + 1;
- copied = nullptr;
- startDelimId_ = newDelimId - 1;
- }
- }
- tokenized.lastDelim_ = startDelimId_;
-
- toBeSkipped_ = ch - end;
- if (copied) {
- copied->append(currentStart, end);
- leftover_ = std::move(*copied);
- } else {
- leftover_.assign(currentStart, end);
- }
-}
-
-FileReader::FileReader(const std::string& path, size_t bufferSize)
- : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
- fd_ = open(path.c_str(), O_RDONLY, 0777);
- if (fd_ < 0) {
- throw std::runtime_error(
- "Error opening file for reading: " + std::string(std::strerror(errno)));
- }
-}
-
-void FileReader::reset() {
- if (lseek(fd_, 0, SEEK_SET) == -1) {
- throw std::runtime_error(
- "Error reseting file cursor: " + std::string(std::strerror(errno)));
- }
-}
-
-FileReader::~FileReader() {
- if (fd_ >= 0) {
- close(fd_);
- }
-}
-
-void FileReader::operator()(CharRange& range) {
- char* buffer = buffer_.get();
- auto numRead = read(fd_, buffer, bufferSize_);
- if (numRead == -1) {
- throw std::runtime_error(
- "Error reading file: " + std::string(std::strerror(errno)));
- }
- if (numRead == 0) {
- range.start = nullptr;
- range.end = nullptr;
- return;
- }
- range.start = buffer;
- range.end = buffer + numRead;
-}
-}
+} // namespace caffe2
diff --git a/caffe2/utils/string_utils.h b/caffe2/utils/string_utils.h
index 0cd727a..02067bc 100644
--- a/caffe2/utils/string_utils.h
+++ b/caffe2/utils/string_utils.h
@@ -8,111 +8,4 @@
std::vector<std::string> split(char separator, const std::string& string);
-struct Token {
- int startDelimId;
- const char* start;
- const char* end;
-};
-
-class TokenizedString {
- // holder for strings that have been modified
- std::vector<std::unique_ptr<std::string>> modifiedStrings_;
- std::vector<Token> tokens_;
- int lastDelim_;
-
- public:
- const std::vector<Token>& tokens() const {
- return tokens_;
- }
- const int lastDelim() const {
- return lastDelim_;
- }
- friend class Tokenizer;
-};
-
-class Tokenizer {
- private:
- int startDelimId_;
- // state of the tokenizer
- std::string leftover_;
- // if we need to skip the first characters of the next batch because
- // e.g. a escape char that was the last character of the last batch.
- int toBeSkipped_;
- int delimTable_[256];
- const char escape_;
-
- public:
- Tokenizer(const std::vector<char>& delimiters, char escape);
- void reset();
- void next(char* start, char* end, TokenizedString& tokenized);
-};
-
-struct CharRange {
- char* start;
- char* end;
-};
-
-struct StringProvider {
- virtual void operator()(CharRange&) = 0;
- virtual void reset() = 0;
- virtual ~StringProvider() {}
-};
-
-class BufferedTokenizer {
- public:
- BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
- : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
-
- bool next(Token& token) {
- CharRange range;
- while (tokenIndex_ >= tokenized_.tokens().size()) {
- range.start = nullptr;
- while (range.start == nullptr && pass_ < numPasses_) {
- (*provider_)(range);
- if (range.start == nullptr) {
- ++pass_;
- if (pass_ < numPasses_) {
- provider_->reset();
- tokenizer_.reset();
- }
- }
- }
- if (range.start == nullptr) {
- return false;
- }
- tokenizer_.next(range.start, range.end, tokenized_);
- tokenIndex_ = 0;
- }
- token = tokenized_.tokens()[tokenIndex_++];
- return true;
- };
-
- int endDelim() const {
- if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
- return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
- }
- return tokenized_.lastDelim();
- }
-
- private:
- StringProvider* provider_;
- Tokenizer tokenizer_;
- TokenizedString tokenized_;
- int tokenIndex_;
- int numPasses_;
- int pass_{0};
-};
-
-class FileReader : public StringProvider {
- public:
- explicit FileReader(const std::string& path, size_t bufferSize = 65536);
- ~FileReader();
- void operator()(CharRange& range) override;
- void reset() override;
-
- private:
- const size_t bufferSize_;
- int fd_;
- std::unique_ptr<char[]> buffer_;
-};
-}
+} // namespace caffe2
diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h
index d8d11f9..fbcdd25 100644
--- a/caffe2/utils/zmq_helper.h
+++ b/caffe2/utils/zmq_helper.h
@@ -10,7 +10,7 @@
class ZmqContext {
public:
explicit ZmqContext(int io_threads) : ptr_(zmq_ctx_new()) {
- CHECK(ptr_ != nullptr) << "Failed to create zmq context.";
+ CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq context.");
int rc = zmq_ctx_set(ptr_, ZMQ_IO_THREADS, io_threads);
CHECK_EQ(rc, 0);
rc = zmq_ctx_set(ptr_, ZMQ_MAX_SOCKETS, ZMQ_MAX_SOCKETS_DFLT);
@@ -55,7 +55,7 @@
public:
explicit ZmqSocket(int type)
: context_(1), ptr_(zmq_socket(context_.ptr(), type)) {
- CHECK(ptr_ != nullptr) << "Faild to create zmq socket.";
+ CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq socket.");
}
~ZmqSocket() {
@@ -97,7 +97,7 @@
}
int SendTillSuccess(const string& msg, int flags) {
- CHECK(msg.size()) << "You cannot send an empty message.";
+ CAFFE_ENFORCE(msg.size(), "You cannot send an empty message.");
int nbytes = 0;
do {
nbytes = Send(msg, flags);