| /** |
| * Copyright (c) 2016-present, Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <chrono> |
| #include <fstream> |
| #include <iostream> |
| #include <string> |
| #include <thread> |
| #ifdef _WIN32 |
| #ifndef WIN32_LEAN_AND_MEAN |
| #define WIN32_LEAN_AND_MEAN |
| #endif |
| #include <windows.h> |
| #include <psapi.h> |
| #endif |
| |
| #include <binaries/benchmark_helper.h> |
| #include "caffe2/core/blob_serialization.h" |
// Note: __CUDA_ARCH__ is only defined by nvcc while compiling device code, so
// this guard (used for all CUDA paths in this file) is never set during an
// ordinary host compile; a build-level flag such as USE_CUDA may be intended.
#ifdef __CUDA_ARCH__
#include "caffe2/core/context_gpu.h"
#endif
| #include "caffe2/core/init.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/net.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/tensor_int8.h" |
| #include "caffe2/utils/bench_utils.h" |
| #include "caffe2/utils/string_utils.h" |
| #include <observers/net_observer_reporter_print.h> |
| #include <observers/observer_config.h> |
| #include <observers/perf_observer.h> |
| |
// <malloc.h> does not exist on Apple platforms; use <malloc/malloc.h> there.
#ifdef __APPLE__
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
| |
| |
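// Install the perf observer on every net that is created and route its
// measurements through the printing reporter, which emits one line per
// metric (see logBenchmarkResult below for the same output format).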
| void observerConfig() { |
| caffe2::ClearGlobalNetObservers(); |
| caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) { |
| return std::make_unique<caffe2::PerfNetObserver>(subject); |
| }); |
| caffe2::ObserverConfig::setReporter( |
| std::make_unique<caffe2::NetObserverReporterPrint>()); |
| } |
| |
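// Returns true only when the requested backend is "cuda" and a CUDA device
// is available; throws when CUDA is requested but unavailable. Every other
// backend runs on CPU.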
| bool backendCudaSet(const string& backend) { |
| bool run_on_gpu = false; |
| if (backend == "cuda") { |
| #ifdef __CUDA_ARCH__ |
| if (caffe2::HasCudaGPU()) { |
| run_on_gpu = true; |
| } else { |
| CAFFE_THROW("NO GPU support on this host machine"); |
| } |
| #else |
| CAFFE_THROW("NO GPU support"); |
| #endif |
| } |
| return run_on_gpu; |
| } |
| |
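// Stamp every operator in the net with the given device. A minimal sketch of
// forcing CPU execution on an already-parsed NetDef:
//
//   caffe2::DeviceType dev = caffe2::CPU;
//   setDeviceType(&net_def, dev);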
| void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) { |
| for (int j = 0; j < net_def->op_size(); j++) { |
| caffe2::OperatorDef* op = net_def->mutable_op(j); |
| op->mutable_device_option()->set_device_type(caffe2::TypeToProto(run_dev)); |
| } |
| } |
| |
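// Map the --backend flag to an operator engine and set it on every op.
// "builtin" leaves the net untouched, "default" sets an empty engine, and
// any unrecognized backend throws.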
| void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { |
| if (backend != "builtin") { |
    string engine = backend == "nnpack" ? "NNPACK"
        : backend == "eigen" ? "EIGEN"
        : backend == "mkl" ? "MKLDNN"
        : backend == "cuda" ? "CUDA"
        : backend == "dnnlowp" ? "DNNLOWP"
        : backend == "dnnlowp_acc16" ? "DNNLOWP_ACC16"
        : backend == "default" ? "" : "NONE";
| CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); |
| for (int i = 0; i < net_def->op_size(); i++) { |
| caffe2::OperatorDef* op_def = net_def->mutable_op(i); |
| op_def->set_engine(engine); |
| } |
| } |
| } |
| |
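// Prepare the input blobs either from serialized TensorProtos files or from
// explicit dims/types, and return how many entries each input provides.
// Names and files are comma-separated; dims and types are semicolon-separated
// per input, with the dims themselves comma-separated. Hypothetical flag
// values for illustration:
//
//   --input "data,label" --input_file "data.pb,label.pb"
//   --input "data" --input_dims "1,3,224,224" --input_type "float"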
| int loadInput( |
| shared_ptr<caffe2::Workspace> workspace, |
| const bool run_on_gpu, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| const string& input, |
| const string& input_file, |
| const string& input_dims, |
| const string& input_type) { |
  // Number of entries each input blob provides; defaults to 1.
  int blob_num = 1;
| // Load input. |
| if (input.size()) { |
| vector<string> input_names = caffe2::split(',', input); |
| if (input_file.size()) { |
| vector<string> input_files = caffe2::split(',', input_file); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_files.size(), |
| "Input name and file should have the same number."); |
      for (size_t i = 0; i < input_names.size(); ++i) {
| caffe2::TensorProtos tensor_protos; |
| CAFFE_ENFORCE( |
| caffe2::ReadProtoFromFile(input_files[i], &tensor_protos)); |
| workspace->CreateBlob(input_names[i]); |
| tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos)); |
| } |
| // Check that all blobs have the same number of entries |
| blob_num = tensor_protos_map[input_names[0]].protos_size(); |
      for (size_t i = 1; i < input_names.size(); ++i) {
        int bnum = tensor_protos_map[input_names[i]].protos_size();
        CAFFE_ENFORCE_EQ(
            blob_num,
            bnum,
            "Number of entries is not the same for all inputs");
| } |
| } else if (input_dims.size() || input_type.size()) { |
      CAFFE_ENFORCE_GT(
          input_dims.size(),
          0,
          "Input dims must be specified when input tensors are used.");
      CAFFE_ENFORCE_GT(
          input_type.size(),
          0,
          "Input type must be specified when input tensors are used.");
| |
| vector<string> input_dims_list = caffe2::split(';', input_dims); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_dims_list.size(), |
| "Input name and dims should have the same number of items."); |
| vector<string> input_type_list = caffe2::split(';', input_type); |
| CAFFE_ENFORCE_EQ( |
| input_names.size(), |
| input_type_list.size(), |
| "Input name and type should have the same number of items."); |
| for (size_t i = 0; i < input_names.size(); ++i) { |
| vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]); |
| vector<int> input_dims; |
| for (const string& s : input_dims_str) { |
| input_dims.push_back(std::stoi(s)); |
| } |
| caffe2::Blob* blob = workspace->GetBlob(input_names[i]); |
| if (blob == nullptr) { |
| blob = workspace->CreateBlob(input_names[i]); |
| } |
| if (run_on_gpu) { |
| LOG(INFO) << "Running on GPU."; |
| #ifdef __CUDA_ARCH__ |
| caffe2::TensorCUDA* tensor = blob->GetMutable<caffe2::TensorCUDA>(); |
| TORCH_CHECK_NOTNULL(tensor); |
| tensor->Resize(input_dims); |
| if (input_type_list[i] == "uint8_t") { |
| tensor->mutable_data<uint8_t>(); |
| } else if (input_type_list[i] == "float") { |
| tensor->mutable_data<float>(); |
| } else { |
| CAFFE_THROW("Unsupported input type: ", input_type_list[i]); |
| } |
| #else |
| CAFFE_THROW("Not support GPU on mobile."); |
| #endif |
| } else { |
| if (input_type_list[i] == "uint8_t") { |
| caffe2::int8::Int8TensorCPU* tensor = |
| blob->GetMutable<caffe2::int8::Int8TensorCPU>(); |
| TORCH_CHECK_NOTNULL(tensor); |
| tensor->t.Resize(input_dims); |
| tensor->t.mutable_data<uint8_t>(); |
| } else if (input_type_list[i] == "float") { |
| caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); |
| TORCH_CHECK_NOTNULL(tensor); |
| tensor->Resize(input_dims); |
| tensor->mutable_data<float>(); |
| } else if (input_type_list[i] == "int") { |
| caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); |
| TORCH_CHECK_NOTNULL(tensor); |
| tensor->Resize(input_dims); |
| tensor->mutable_data<int>(); |
| } else { |
| CAFFE_THROW("Unsupported input type: ", input_type_list[i]); |
| } |
| } |
| } |
| } else { |
| CAFFE_THROW( |
| "You requested input tensors, but neither input_file nor " |
| "input_dims is set."); |
| } |
| } |
| return blob_num; |
| } |
| |
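// Deserialize the iteration-th entry of each input's TensorProtos into its
// blob, cycling through the entries (iteration % protos_size) when there are
// fewer entries than iterations.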
| void fillInputBlob( |
| shared_ptr<caffe2::Workspace> workspace, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| int iteration) { |
| if (tensor_protos_map.empty()) { |
| return; |
| } |
| static caffe2::TensorDeserializer deserializer; |
| for (auto& tensor_kv : tensor_protos_map) { |
| caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first); |
| if (blob == nullptr) { |
| blob = workspace->CreateBlob(tensor_kv.first); |
| } |
| // todo: support gpu and make this function a template |
| int protos_size = tensor_kv.second.protos_size(); |
    if (protos_size == 1 && iteration > 0) {
      // When there is only a single input entry, do not overwrite it on
      // later iterations, since rewriting the data would clear all caches.
      // Rely on wipe_cache to clear caches instead.
      continue;
    }
| caffe2::TensorProto* tensor_proto = |
| tensor_kv.second.mutable_protos(iteration % protos_size); |
| BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto)); |
| // todo: for other types |
| } |
| } |
| |
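// Drive the benchmark itself: warmup runs first, then timed whole-net runs,
// then optional per-operator runs. Cache wipes and sleeps are inserted
// between phases according to the corresponding flags.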
| void runNetwork( |
| shared_ptr<caffe2::Workspace> workspace, |
| caffe2::NetBase* net, |
| map<string, caffe2::TensorProtos>& tensor_protos_map, |
| const bool wipe_cache, |
| const bool run_individual, |
| const bool run_on_gpu, |
| const bool text_output, |
| const int warmup, |
| const int iter, |
| const int num_blobs, |
| const int sleep_before_run, |
| const int sleep_between_iteration, |
| const int sleep_between_net_and_operator, |
| const std::string& output, |
| const std::string& output_folder) { |
| |
| LOG(INFO) << "Starting benchmark."; |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); |
| LOG(INFO) << "Running warmup runs."; |
| for (int i = 0; i < warmup; ++i) { |
| fillInputBlob(workspace, tensor_protos_map, i); |
| CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); |
| } |
| |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| if (sleep_before_run > 0) { |
| std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run)); |
| } |
| LOG(INFO) << "Main runs."; |
  CAFFE_ENFORCE(
      iter >= 0,
      "Number of main runs should be non-negative, provided ",
      iter,
      ".");
  LOG(INFO) << "Net runs.";
| long long duration_sum = 0; |
| for (int i = 0; i < iter; ++i) { |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); |
| fillInputBlob(workspace, tensor_protos_map, i); |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| auto start = std::chrono::high_resolution_clock::now(); |
| CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); |
| auto stop = std::chrono::high_resolution_clock::now(); |
    auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
| duration_sum += duration.count(); |
    // Write the output for the first num_blobs iterations.
| writeOutput( |
| workspace, |
| run_on_gpu, |
| output, |
| output_folder, |
| text_output, |
| i, |
| num_blobs); |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| if (sleep_between_iteration > 0) { |
| std::this_thread::sleep_for( |
| std::chrono::seconds(sleep_between_iteration)); |
| } |
| } |
  if (iter > 0) {
    std::cout << "Average Duration: " << (duration_sum / iter) << " us"
              << std::endl;
  }
| if (run_individual) { |
| LOG(INFO) << "operator runs."; |
| if (sleep_between_net_and_operator > 0) { |
| std::this_thread::sleep_for( |
| std::chrono::seconds(sleep_between_net_and_operator)); |
| } |
| for (int i = 0; i < iter; ++i) { |
| caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); |
| fillInputBlob(workspace, tensor_protos_map, i); |
| CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); |
| if (wipe_cache) { |
| caffe2::wipe_cache(); |
| } |
| if (sleep_between_iteration > 0) { |
| std::this_thread::sleep_for( |
| std::chrono::seconds(sleep_between_iteration)); |
| } |
| } |
| } |
| } |
| |
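// Dump the requested output blobs, either as text files or as serialized
// blob protos; passing "*" as the output name selects every blob in the
// workspace.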
| void writeOutput( |
| shared_ptr<caffe2::Workspace> workspace, |
| const bool run_on_gpu, |
| const string& output, |
| const string& output_folder, |
| const bool text_output, |
| const int index, |
| const int num_blobs) { |
| if (output.size() == 0) { |
| return; |
| } |
| string output_prefix = output_folder.size() ? output_folder + "/" : ""; |
| vector<string> output_names = caffe2::split(',', output); |
| if (output == "*") { |
| output_names = workspace->Blobs(); |
| } |
| for (const string& name : output_names) { |
| CAFFE_ENFORCE( |
| workspace->HasBlob(name), |
| "You requested a non-existing blob: ", |
| name); |
| if (text_output) { |
| if (run_on_gpu) { |
| #ifdef __CUDA_ARCH__ |
| writeTextOutput<caffe2::CUDAContext, caffe2::TensorCUDA>( |
| workspace->GetBlob(name)->GetMutable<caffe2::TensorCUDA>(), |
| output_prefix, |
| name, |
| index, |
| num_blobs); |
| #else |
| CAFFE_THROW("Not support GPU."); |
| #endif |
| } else { |
| writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>( |
| BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), |
| output_prefix, |
| name, |
| index, |
| num_blobs); |
| } |
| } else { |
      // Multiple entries per blob are not supported in binary mode.
      CAFFE_ENFORCE(
          index == 0,
          "Binary output supports only a single entry per blob.");
| string serialized = SerializeBlob(*workspace->GetBlob(name), name); |
| string output_filename = output_prefix + name; |
| caffe2::WriteStringToFile(serialized, output_filename.c_str()); |
| } |
| } |
| } |
| |
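// Emit a single result line in the reporter's format. Assuming the
// reporter's identifier string, the memory report below would look roughly
// like:
//
//   Caffe2Observer {"type": "NET_", "metric": "memory", "unit": "kB", "value": 1024}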
| void logBenchmarkResult( |
| const std::string& type, |
| const std::string& metric, |
| const std::string& unit, |
| const int value) { |
| LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{" |
| << "\"type\": \"" << type << "\", " |
| << "\"metric\": \"" << metric << "\", " |
| << "\"unit\": \"" << unit << "\", " |
| << "\"value\": " << c10::to_string(value) << "}\n"; |
| } |
| |
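// Best-effort process memory reading: allocator statistics on Apple
// platforms, private usage on Windows, and mallinfo's in-use bytes
// elsewhere. Returns 0 when measurement is disabled.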
| long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) { |
| if (FLAGS_measure_memory) { |
#ifdef __APPLE__
| malloc_statistics_t stats = {0}; |
| malloc_zone_statistics(nullptr, &stats); |
| return stats.size_allocated; |
| #elif defined(_WIN32) |
| PROCESS_MEMORY_COUNTERS_EX pmc; |
| GetProcessMemoryInfo( |
| GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*)&pmc, sizeof(pmc)); |
| return pmc.PrivateUsage; |
| #else |
| struct mallinfo info = mallinfo(); |
| return info.uordblks; |
| #endif |
| } |
| |
| return 0; |
| } |
| |
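// Entry point shared by the benchmark binaries: validates the flag-named
// files, runs the init net (measuring its memory cost when requested), loads
// the inputs, builds the main net, and hands everything to runNetwork. A
// minimal call-site sketch with hypothetical flag values:
//
//   benchmark(argc, argv,
//       /*backend=*/"builtin", /*init_net=*/"init_net.pb",
//       /*input=*/"data", /*input_dims=*/"1,3,224,224", /*input_file=*/"",
//       /*input_type=*/"float", /*iter=*/10, /*measure_memory=*/false,
//       /*net=*/"predict_net.pb", /*output=*/"", /*output_folder=*/"",
//       /*run_individual=*/false, /*sleep_before_run=*/0,
//       /*sleep_between_iteration=*/0, /*sleep_between_net_and_operator=*/0,
//       /*text_output=*/false, /*warmup=*/2, /*wipe_cache=*/false);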
| int benchmark( |
| int argc, |
| char* argv[], |
| const string& FLAGS_backend, |
| const string& FLAGS_init_net, |
| const string& FLAGS_input, |
| const string& FLAGS_input_dims, |
| const string& FLAGS_input_file, |
| const string& FLAGS_input_type, |
| int FLAGS_iter, |
| bool FLAGS_measure_memory, |
| const string& FLAGS_net, |
| const string& FLAGS_output, |
| const string& FLAGS_output_folder, |
| bool FLAGS_run_individual, |
| int FLAGS_sleep_before_run, |
| int FLAGS_sleep_between_iteration, |
| int FLAGS_sleep_between_net_and_operator, |
| bool FLAGS_text_output, |
| int FLAGS_warmup, |
| bool FLAGS_wipe_cache) { |
| // Check arguments to be correct |
| { |
    // Check up front that the files exist, since the proto reader does not
    // fail loudly when a file is missing.
| std::ifstream net_file(FLAGS_net); |
| CAFFE_ENFORCE(net_file.good()); |
| net_file.close(); |
| |
| std::ifstream init_net_file(FLAGS_init_net); |
| CAFFE_ENFORCE(init_net_file.good()); |
| init_net_file.close(); |
| |
| if (FLAGS_input_file.size() > 0) { |
| vector<string> input_files = caffe2::split(',', FLAGS_input_file); |
      for (const auto& input_file : input_files) {
| std::ifstream ifile(input_file); |
| CAFFE_ENFORCE(ifile.good()); |
| ifile.close(); |
| } |
| } |
| } |
| |
| observerConfig(); |
| caffe2::ShowLogInfoToStderr(); |
| |
  auto workspace = std::make_shared<caffe2::Workspace>();
| bool run_on_gpu = backendCudaSet(FLAGS_backend); |
| // Run initialization network, measure resources used. |
| long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); |
| caffe2::NetDef init_net_def; |
| CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); |
| setOperatorEngine(&init_net_def, FLAGS_backend); |
| CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); |
| init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem; |
| |
| map<string, caffe2::TensorProtos> tensor_protos_map; |
| int num_blobs = loadInput( |
| workspace, |
| run_on_gpu, |
| tensor_protos_map, |
| FLAGS_input, |
| FLAGS_input_file, |
| FLAGS_input_dims, |
| FLAGS_input_type); |
| |
| // Run main network. |
| long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); |
| caffe2::NetDef net_def; |
| CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); |
| setOperatorEngine(&net_def, FLAGS_backend); |
| if (!net_def.has_name()) { |
| net_def.set_name("benchmark"); |
| } |
| caffe2::NetBase* net = workspace->CreateNet(net_def); |
| TORCH_CHECK_NOTNULL(net); |
| runNetwork( |
| workspace, |
| net, |
| tensor_protos_map, |
| FLAGS_wipe_cache, |
| FLAGS_run_individual, |
| run_on_gpu, |
| FLAGS_text_output, |
| FLAGS_warmup, |
| FLAGS_iter, |
| num_blobs, |
| FLAGS_sleep_before_run, |
| FLAGS_sleep_between_iteration, |
| FLAGS_sleep_between_net_and_operator, |
| FLAGS_output, |
| FLAGS_output_folder); |
| predict_vmem = getVirtualMemoryIfOptionEnabled( |
| FLAGS_measure_memory) - predict_vmem; |
| if (FLAGS_measure_memory) { |
| logBenchmarkResult( |
| "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024); |
| } |
| |
| return 0; |
| } |