| /** |
| * Copyright (c) 2016-present, Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <chrono> |
| #include <fstream> |
| #include <string> |
| #include <thread> |
| |
| #include "binaries/benchmark_helper.h" |
| #include "caffe2/core/blob_serialization.h" |
| #ifdef __CUDA_ARCH__ |
| #include "caffe2/core/context_gpu.h" |
| #endif |
| #include "caffe2/core/init.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/net.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/tensor_int8.h" |
| #include "caffe2/utils/bench_utils.h" |
| #include "caffe2/utils/string_utils.h" |
| #include "observers/net_observer_reporter_print.h" |
| #include "observers/observer_config.h" |
| #include "observers/perf_observer.h" |
| |
| using std::map; |
| using std::shared_ptr; |
| using std::string; |
| using std::unique_ptr; |
| using std::vector; |
| |
| void observerConfig() { |
| caffe2::ClearGlobalNetObservers(); |
| caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) { |
| return caffe2::make_unique<caffe2::PerfNetObserver>(subject); |
| }); |
| caffe2::ObserverConfig::setReporter( |
| caffe2::make_unique<caffe2::NetObserverReporterPrint>()); |
| } |
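| |
| // Usage sketch (illustrative, not part of this file's flow): call |
| // observerConfig() once at startup so that every net created afterwards |
| // gets a PerfNetObserver attached and reports timings through |
| // NetObserverReporterPrint: |
| // |
| //   observerConfig(); |
| //   caffe2::Workspace ws; |
| //   caffe2::NetBase* net = ws.CreateNet(net_def); |
| //   net->Run();  // per-run timings are emitted by the print reporter |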
| |
| bool backendCudaSet(const string& backend) { |
| bool run_on_gpu = false; |
| if (backend == "cuda") { |
| #ifdef __CUDA_ARCH__ |
| if (caffe2::HasCudaGPU()) { |
| run_on_gpu = true; |
| } else { |
| CAFFE_THROW("NO GPU support on this host machine"); |
| } |
| #else |
| CAFFE_THROW("NO GPU support"); |
| #endif |
| } |
| return run_on_gpu; |
| } |
| |
| void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) { |
| for (int j = 0; j < net_def->op_size(); j++) { |
| caffe2::OperatorDef* op = net_def->mutable_op(j); |
| op->mutable_device_option()->set_device_type(caffe2::TypeToProto(run_dev)); |
| } |
| } |
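| |
| // A minimal sketch of forcing a whole net onto one device; the variable |
| // names are illustrative. An lvalue is needed because run_dev is taken by |
| // non-const reference: |
| // |
| //   caffe2::DeviceType cpu_type = caffe2::CPU; |
| //   setDeviceType(&net_def, cpu_type); |
| //   // each op now carries: device_option { device_type: PROTO_CPU } |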
| |
| void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { |
| if (backend != "builtin") { |
| string engine = backend == "nnpack" ? "NNPACK" |
| : backend == "eigen" ? "EIGEN" |
| : backend == "mkl" ? "MKLDNN" |
| : backend == "cuda" ? "CUDA" |
| : backend == "dnnlowp" ? "DNNLOWP" |
| : backend == "dnnlowp_acc16" ? "DNNLOWP_ACC16" |
| : backend == "default" ? "" : "NONE"; |
| CAFFE_ENFORCE(engine != "NONE", "Backend is not supported: ", backend); |
| for (int i = 0; i < net_def->op_size(); i++) { |
| caffe2::OperatorDef* op_def = net_def->mutable_op(i); |
| op_def->set_engine(engine); |
| } |
| } |
| } |
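| |
| // For example (sketch), --backend mkl rewrites each operator roughly from |
| // |
| //   op { type: "Conv" } |
| // to |
| //   op { type: "Conv" engine: "MKLDNN" } |
| // |
| // while "default" clears the engine string so each operator falls back to |
| // its default implementation. |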
| |
| void loadInput( |
| shared_ptr<caffe2::Workspace> workspace, |
| const bool run_on_gpu, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| const string& input, |
| const string& input_file, |
| const string& input_dims, |
| const string& input_type) { |
| // Load input. |
| if (input.size()) { |
| vector<string> input_names = caffe2::split(',', input); |
| if (input_file.size()) { |
| vector<string> input_files = caffe2::split(',', input_file); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_files.size(), |
| "Input name and file should have the same number."); |
| for (int i = 0; i < input_names.size(); ++i) { |
| caffe2::TensorProtos tensor_protos; |
| CAFFE_ENFORCE( |
| caffe2::ReadProtoFromFile(input_files[i], &tensor_protos)); |
| workspace->CreateBlob(input_names[i]); |
| tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos)); |
| } |
| } else if (input_dims.size() || input_type.size()) { |
| CAFFE_ENFORCE_GT( |
| input_dims.size(), |
| 0, |
| "Input dims must be specified when input tensors are used."); |
| CAFFE_ENFORCE_GT( |
| input_type.size(), |
| 0, |
| "Input type must be specified when input tensors are used."); |
| |
| vector<string> input_dims_list = caffe2::split(';', input_dims); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_dims_list.size(), |
| "Input name and dims should have the same number of items."); |
| vector<string> input_type_list = caffe2::split(';', input_type); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_type_list.size(), |
| "Input name and type should have the same number of items."); |
| for (size_t i = 0; i < input_names.size(); ++i) { |
| vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]); |
| vector<int> input_dims; |
| for (const string& s : input_dims_str) { |
| input_dims.push_back(caffe2::stoi(s)); |
| } |
| caffe2::Blob* blob = workspace->GetBlob(input_names[i]); |
| if (blob == nullptr) { |
| blob = workspace->CreateBlob(input_names[i]); |
| } |
| if (run_on_gpu) { |
| LOG(INFO) << "Running on GPU."; |
| #ifdef __CUDA_ARCH__ |
| caffe2::TensorCUDA* tensor = blob->GetMutable<caffe2::TensorCUDA>(); |
| CHECK_NOTNULL(tensor); |
| tensor->Resize(input_dims); |
| if (input_type_list[i] == "uint8_t") { |
| tensor->mutable_data<uint8_t>(); |
| } else if (input_type_list[i] == "float") { |
| tensor->mutable_data<float>(); |
| } else { |
| CAFFE_THROW("Unsupported input type: ", input_type_list[i]); |
| } |
| #else |
| CAFFE_THROW("Not support GPU on mobile."); |
| #endif |
| } else { |
| if (input_type_list[i] == "uint8_t") { |
| caffe2::int8::Int8TensorCPU* tensor = |
| blob->GetMutable<caffe2::int8::Int8TensorCPU>(); |
| CHECK_NOTNULL(tensor); |
| tensor->t.Resize(input_dims); |
| tensor->t.mutable_data<uint8_t>(); |
| } else if (input_type_list[i] == "float") { |
| caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); |
| CHECK_NOTNULL(tensor); |
| tensor->Resize(input_dims); |
| tensor->mutable_data<float>(); |
| } else { |
| CAFFE_THROW("Unsupported input type: ", input_type_list[i]); |
| } |
| } |
| } |
| } else { |
| CAFFE_THROW( |
| "You requested input tensors, but neither input_file nor " |
| "input_dims is set."); |
| } |
| } |
| } |
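| |
| // Flag grammar sketch (flag values are illustrative): input names are |
| // comma-separated; dims and types for multiple inputs are separated by |
| // semicolons, with one input's dims comma-separated internally: |
| // |
| //   --input "data,label" |
| //   --input_dims "1,3,224,224;1" |
| //   --input_type "float;uint8_t" |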
| |
| void fillInputBlob( |
| shared_ptr<caffe2::Workspace> workspace, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| int iteration) { |
| if (tensor_protos_map.empty()) { |
| return; |
| } |
| static caffe2::TensorDeserializer serializer; |
| for (auto& tensor_kv : tensor_protos_map) { |
| caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first); |
| if (blob == nullptr) { |
| blob = workspace->CreateBlob(tensor_kv.first); |
| } |
| // TODO: support GPU and make this function a template. |
| int protos_size = tensor_kv.second.protos_size(); |
| CAFFE_ENFORCE_GT( |
| protos_size, 0, "No tensor protos provided for input ", tensor_kv.first); |
| caffe2::TensorProto* tensor_proto = |
| tensor_kv.second.mutable_protos(iteration % protos_size); |
| if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { |
| caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); |
| int total_size = tensor_proto->string_data_size(); |
| // Allocate the string storage before writing into it. |
| tensor->Resize(total_size); |
| string* data = tensor->mutable_data<string>(); |
| for (int i = 0; i < total_size; i++) { |
| data[i] = tensor_proto->string_data(i); |
| } |
| } else if (tensor_proto->data_type() == caffe2::TensorProto::FLOAT) { |
| vector<int64_t> dims; |
| for (const int64_t d : tensor_proto->dims()) { |
| dims.push_back(d); |
| } |
| // Reset() takes ownership of the newly deserialized tensor. |
| caffe2::TensorCPU* tensor = |
| new caffe2::TensorCPU(dims, caffe2::DeviceType::CPU); |
| serializer.Deserialize(*tensor_proto, tensor); |
| blob->Reset(tensor); |
| } |
| // TODO: handle other data types. |
| } |
| } |
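| |
| // The iteration index cycles through the supplied protos round-robin; for |
| // example (sketch), with 3 protos per input, iterations 0..4 deserialize |
| // protos 0, 1, 2, 0, 1. |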
| |
| void runNetwork( |
| shared_ptr<caffe2::Workspace> workspace, |
| caffe2::NetDef& net_def, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| const bool wipe_cache, |
| const bool run_individual, |
| const int warmup, |
| const int iter, |
| const int sleep_before_run) { |
| if (!net_def.has_name()) { |
| net_def.set_name("benchmark"); |
| } |
| |
| caffe2::NetBase* net = workspace->CreateNet(net_def); |
| CHECK_NOTNULL(net); |
| |
| LOG(INFO) << "Starting benchmark."; |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); |
| LOG(INFO) << "Running warmup runs."; |
| for (int i = 0; i < warmup; ++i) { |
| fillInputBlob(workspace, tensor_protos_map, i); |
| CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); |
| } |
| |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| if (sleep_before_run > 0) { |
| std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run)); |
| } |
| LOG(INFO) << "Main runs."; |
| CAFFE_ENFORCE( |
| iter >= 0, |
| "Number of main runs should be non negative, provided ", |
| iter, |
| "."); |
| for (int i = 0; i < iter; ++i) { |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); |
| fillInputBlob(workspace, tensor_protos_map, i); |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| if (run_individual) { |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); |
| CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); |
| } |
| } |
| } |
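| |
| // Call sketch (argument values are illustrative): 10 warmup runs, then 50 |
| // timed iterations with cache wipes around each run and per-operator |
| // profiling enabled: |
| // |
| //   runNetwork( |
| //       workspace, net_def, tensor_protos_map, |
| //       /*wipe_cache=*/true, /*run_individual=*/true, |
| //       /*warmup=*/10, /*iter=*/50, /*sleep_before_run=*/0); |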
| |
| void writeOutput( |
| shared_ptr<caffe2::Workspace> workspace, |
| const bool run_on_gpu, |
| const string& output, |
| const string& output_folder, |
| const bool text_output) { |
| string output_prefix = output_folder.size() ? output_folder + "/" : ""; |
| if (output.size()) { |
| vector<string> output_names = caffe2::split(',', output); |
| if (output == "*") { |
| output_names = workspace->Blobs(); |
| } |
| for (const string& name : output_names) { |
| CAFFE_ENFORCE( |
| workspace->HasBlob(name), |
| "You requested a non-existing blob: ", |
| name); |
| if (text_output) { |
| if (run_on_gpu) { |
| #ifdef __CUDA_ARCH__ |
| writeTextOutput<caffe2::CUDAContext, caffe2::TensorCUDA>( |
| workspace->GetBlob(name)->GetMutable<caffe2::TensorCUDA>(), |
| output_prefix, |
| name); |
| #else |
| CAFFE_THROW("Not support GPU."); |
| #endif |
| } else { |
| writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>( |
| BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), |
| output_prefix, |
| name); |
| } |
| } else { |
| string serialized = SerializeBlob(*workspace->GetBlob(name), name); |
| string output_filename = output_prefix + name; |
| caffe2::WriteStringToFile(serialized, output_filename.c_str()); |
| } |
| } |
| } |
| } |
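| |
| // Sketch: --output "*" dumps every blob in the workspace. With an output |
| // folder such as /tmp/out (illustrative path), each blob "name" is written |
| // to /tmp/out/name, as text with --text_output or as a serialized blob |
| // proto otherwise. |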
| |
| int benchmark( |
| int argc, |
| char* argv[], |
| const string& FLAGS_backend, |
| const string& FLAGS_init_net, |
| const string& FLAGS_input, |
| const string& FLAGS_input_dims, |
| const string& FLAGS_input_file, |
| const string& FLAGS_input_type, |
| int FLAGS_iter, |
| const string& FLAGS_net, |
| const string& FLAGS_output, |
| const string& FLAGS_output_folder, |
| bool FLAGS_run_individual, |
| int FLAGS_sleep_before_run, |
| bool FLAGS_text_output, |
| int FLAGS_warmup, |
| bool FLAGS_wipe_cache) { |
| // Check that the arguments are valid. |
| { |
| // Check that each file exists, since the proto reader does not fail |
| // loudly when a file is missing. |
| std::ifstream net_file(FLAGS_net); |
| CAFFE_ENFORCE(net_file.good()); |
| net_file.close(); |
| |
| std::ifstream init_net_file(FLAGS_init_net); |
| CAFFE_ENFORCE(init_net_file.good()); |
| init_net_file.close(); |
| |
| if (FLAGS_input_file.size() > 0) { |
| vector<string> input_files = caffe2::split(',', FLAGS_input_file); |
| for (const auto& input_file : input_files) { |
| std::ifstream ifile(input_file); |
| CAFFE_ENFORCE(ifile.good()); |
| ifile.close(); |
| } |
| } |
| } |
| |
| observerConfig(); |
| caffe2::ShowLogInfoToStderr(); |
| |
| auto workspace = std::make_shared<caffe2::Workspace>(); |
| bool run_on_gpu = backendCudaSet(FLAGS_backend); |
| // Run initialization network. |
| caffe2::NetDef init_net_def; |
| CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); |
| setOperatorEngine(&init_net_def, FLAGS_backend); |
| CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); |
| |
| // Run main network. |
| caffe2::NetDef net_def; |
| CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); |
| setOperatorEngine(&net_def, FLAGS_backend); |
| |
| map<string, caffe2::TensorProtos> tensor_protos_map; |
| |
| loadInput( |
| workspace, |
| run_on_gpu, |
| tensor_protos_map, |
| FLAGS_input, |
| FLAGS_input_file, |
| FLAGS_input_dims, |
| FLAGS_input_type); |
| |
| runNetwork( |
| workspace, |
| net_def, |
| tensor_protos_map, |
| FLAGS_wipe_cache, |
| FLAGS_run_individual, |
| FLAGS_warmup, |
| FLAGS_iter, |
| FLAGS_sleep_before_run); |
| |
| writeOutput( |
| workspace, |
| run_on_gpu, |
| FLAGS_output, |
| FLAGS_output_folder, |
| FLAGS_text_output); |
| |
| return 0; |
| } |
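| |
| // A minimal driver sketch. The FLAGS_* definitions are assumed to come from |
| // the caller (e.g. a gflags-style binary such as caffe2_benchmark.cc): |
| // |
| //   int main(int argc, char* argv[]) { |
| //     caffe2::GlobalInit(&argc, &argv); |
| //     return benchmark( |
| //         argc, argv, |
| //         FLAGS_backend, FLAGS_init_net, FLAGS_input, FLAGS_input_dims, |
| //         FLAGS_input_file, FLAGS_input_type, FLAGS_iter, FLAGS_net, |
| //         FLAGS_output, FLAGS_output_folder, FLAGS_run_individual, |
| //         FLAGS_sleep_before_run, FLAGS_text_output, FLAGS_warmup, |
| //         FLAGS_wipe_cache); |
| //   } |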