| #include "caffe2/operators/onnxifi_op.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
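// Maps the CPU tensor's element type to the corresponding ONNXIFI data type
// and points the descriptor's buffer at the tensor's existing storage; no
// data is copied.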
void SetInputTensorDescriptorTypeAndBuffer(
    const Tensor& cpu_tensor,
    onnxTensorDescriptorV1* desc) {
  if (cpu_tensor.template IsType<float>()) {
    desc->dataType = ONNXIFI_DATATYPE_FLOAT32;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<float>());
  } else if (cpu_tensor.template IsType<int32_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_INT32;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
  } else if (cpu_tensor.template IsType<int8_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_INT8;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
  } else if (cpu_tensor.template IsType<uint8_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_UINT8;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
  } else if (cpu_tensor.template IsType<int64_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_INT64;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int64_t>());
  } else if (cpu_tensor.template IsType<int16_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_INT16;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int16_t>());
  } else if (cpu_tensor.template IsType<uint16_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_UINT16;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint16_t>());
  } else {
    CAFFE_THROW(
        "Unsupported tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
  }
}

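// Variant for quantized Int8TensorCPU blobs: besides the data type and
// buffer, it exposes the tensor's scale and zero_point through the
// descriptor's quantization fields.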
void SetInputTensorDescriptorTypeAndBuffer(
    const int8::Int8TensorCPU& cpu_int8tensor,
    onnxTensorDescriptorV1* desc) {
  const Tensor& cpu_tensor = cpu_int8tensor.t;
  if (cpu_tensor.template IsType<uint8_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_UINT8;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
  } else if (cpu_tensor.template IsType<int32_t>()) {
    desc->dataType = ONNXIFI_DATATYPE_INT32;
    desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
  } else {
    CAFFE_THROW(
        "Unsupported Int8Tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
  }
  desc->quantizationParams = 1;
  desc->quantizationAxis = 1;
  desc->scales = &cpu_int8tensor.scale;
  desc->biases = &cpu_int8tensor.zero_point;
}

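// Maps an ONNXIFI data type enum back to the corresponding Caffe2 TypeMeta;
// throws on unsupported types.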
TypeMeta OnnxifiTypeToDataType(uint64_t onnxifi_type) {
  static std::map<uint64_t, TypeMeta> data_type_map {
      {ONNXIFI_DATATYPE_FLOAT32, TypeMeta::Make<float>()},
      {ONNXIFI_DATATYPE_INT32, TypeMeta::Make<int>()},
      {ONNXIFI_DATATYPE_INT8, TypeMeta::Make<int8_t>()},
      {ONNXIFI_DATATYPE_UINT8, TypeMeta::Make<uint8_t>()},
      {ONNXIFI_DATATYPE_INT64, TypeMeta::Make<int64_t>()},
      {ONNXIFI_DATATYPE_INT16, TypeMeta::Make<int16_t>()},
      {ONNXIFI_DATATYPE_UINT16, TypeMeta::Make<uint16_t>()},
  };
  const auto it = data_type_map.find(onnxifi_type);
  CAFFE_ENFORCE(
      it != data_type_map.end(),
      "Unsupported ONNXIFI data type: ",
      onnxifi_type);
  return it->second;
}

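// Points the descriptor at the output tensor's buffer of the requested type;
// raw_mutable_data allocates the storage if it has not been allocated yet.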
void SetOutputTensorDescriptorTypeAndBuffer(
    uint64_t onnxifi_type,
    Tensor* cpu_tensor,
    onnxTensorDescriptorV1* desc) {
  desc->dataType = onnxifi_type;
  desc->buffer = reinterpret_cast<onnxPointer>(
      cpu_tensor->raw_mutable_data(OnnxifiTypeToDataType(onnxifi_type)));
}

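// Variant for quantized outputs: also wires the descriptor's quantization
// fields to the Int8TensorCPU's scale and zero_point.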
void SetOutputTensorDescriptorTypeAndBuffer(
    uint64_t onnxifi_type,
    int8::Int8TensorCPU* cpu_int8tensor,
    onnxTensorDescriptorV1* desc) {
  desc->dataType = onnxifi_type;
  Tensor* cpu_tensor = &(cpu_int8tensor->t);

  desc->buffer = reinterpret_cast<onnxPointer>(
      cpu_tensor->raw_mutable_data(OnnxifiTypeToDataType(onnxifi_type)));
  desc->quantizationParams = 1;
  desc->quantizationAxis = 1;
  desc->scales = &cpu_int8tensor->scale;
  desc->biases = &cpu_int8tensor->zero_point;
}
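
// Fills an ONNXIFI tensor descriptor for a weight blob in the workspace.
// Only TensorCPU and Int8TensorCPU blobs are supported. The dimension data is
// appended to `shapes`, which must outlive the descriptor.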
void BlobToTensorDescriptor(
    const std::string& name,
    Workspace* ws,
    onnxTensorDescriptorV1* desc,
    std::vector<std::vector<uint64_t>>* shapes) {
  const Blob* blob = ws->GetBlob(name);
  CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist");
  const bool is_int8tensor =
      blob->meta().id() == TypeMeta::Id<int8::Int8TensorCPU>();
  // Memory type
  // We only allow weights to be CPU tensors or Int8TensorCPUs for now
  CAFFE_ENFORCE(
      (BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob)),
      "Initialization blob ",
      name,
      " needs to be TensorCPU or Int8TensorCPU");
  desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
  desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU;

  if (is_int8tensor) {
    // Data type
    const auto& cpu_int8tensor = blob->template Get<int8::Int8TensorCPU>();
    const auto& cpu_tensor = cpu_int8tensor.t;
    SetInputTensorDescriptorTypeAndBuffer(cpu_int8tensor, desc);
    // Set dims
    const auto shape = cpu_tensor.sizes();
    desc->dimensions = shape.size();
    shapes->emplace_back(shape.cbegin(), shape.cend());
    desc->shape = shapes->back().data();
  } else {
    // Data type
    const auto& cpu_tensor = blob->template Get<TensorCPU>();
    SetInputTensorDescriptorTypeAndBuffer(cpu_tensor, desc);
    // Set dims
    const auto shape = cpu_tensor.sizes();
    desc->dimensions = shape.size();
    shapes->emplace_back(shape.cbegin(), shape.cend());
    desc->shape = shapes->back().data();
    desc->quantizationParams = 0;
  }
}
} // namespace

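// Builds ONNXIFI tensor descriptors for every weight listed in
// `initializers`, pointing each descriptor directly at the corresponding
// workspace blob. `weight_names` and `weight_shapes` own the storage the
// descriptors reference, so they must outlive the returned descriptors.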
template <>
std::vector<onnxTensorDescriptorV1>
OnnxifiOp<CPUContext>::buildInitializationList(
    Workspace* ws,
    const std::vector<std::string>& initializers,
    std::vector<std::string>* weight_names,
    std::vector<std::vector<uint64_t>>* weight_shapes) {
  std::unordered_set<std::string> initialization_list(
      initializers.begin(), initializers.end());
  const std::vector<string>& ws_blobs = ws->Blobs();
  // Since onnxTensorDescriptorV1.name will point into the memory in
  // weight_names, we need to prevent weight_names from reallocating by
  // reserving enough memory ahead of time
  weight_names->reserve(ws_blobs.size());
  std::vector<onnxTensorDescriptorV1> descs;
  for (const auto& s : ws_blobs) {
    auto it = initialization_list.find(s);
    if (it != initialization_list.end()) {
      weight_names->emplace_back(s);
      onnxTensorDescriptorV1 tensor_desc;
      tensor_desc.name = weight_names->back().c_str();
      BlobToTensorDescriptor(s, ws, &tensor_desc, weight_shapes);
      descs.push_back(tensor_desc);
      initialization_list.erase(it);
    }
  }
  CAFFE_ENFORCE(initialization_list.empty(), "Unfulfilled initialization list");
  return descs;
}

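// Returns, for each output, the real batch size it should be shrunk to, or 0
// if no adjustment is needed. The output's max batch size (its first
// dimension) is looked up in batch_pos_map_ to find which input's real batch
// size the output should follow.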
template <>
std::vector<int> OnnxifiOp<CPUContext>::extractOutputBatchSizes() const {
  if (!adjust_output_batch_) {
    return std::vector<int>();
  }

  CAFFE_ENFORCE_EQ(
      input_shapes_.size(),
      InputSize(),
      "Input shapes and input size don't match. ",
      input_shapes_.size(),
      " vs ",
      InputSize());
  CAFFE_ENFORCE_EQ(
      output_shapes_.size(),
      OutputSize(),
      "Output shapes and output size don't match. ",
      output_shapes_.size(),
      " vs ",
      OutputSize());

  std::vector<int> adjusted_output_batch;
  for (const auto& shape : output_shapes_) {
    if (shape.empty()) {
      adjusted_output_batch.push_back(0);
    } else {
      const auto max_output_batch_size = shape.front();
      const auto it = batch_pos_map_.find(max_output_batch_size);
      if (it == batch_pos_map_.end()) {
        if (use_onnx_) {
          // For the ONNX path, it's possible that the output batch size is
          // unknown, because we handle the second output of Concat and Split
          // in ONNX. For the C2 path, we should never hit this condition.
          adjusted_output_batch.push_back(0);
          continue;
        } else {
          if (permit_unknown_output_batch_size_) {
            adjusted_output_batch.push_back(0);
            continue;
          } else {
            CAFFE_THROW(
                "Unknown output max batch size: ", max_output_batch_size);
          }
        }
      }
      auto idx = it->second;
      CAFFE_ENFORCE_LT(idx, input_shapes_.size(), "index out of bound");
      const auto& input_shape = input_shapes_[idx];
      // If the input's real batch size equals the output's max batch size,
      // we don't need to adjust the output's max batch size
      if (input_shape.empty() || input_shape.front() == max_output_batch_size) {
        adjusted_output_batch.push_back(0);
      } else {
        adjusted_output_batch.push_back(input_shape.front());
      }
    }
  }

  return adjusted_output_batch;
}

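// Shrinks each output tensor to its real batch size. Entries equal to 0 mean
// the corresponding output does not need adjustment.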
template <>
void OnnxifiOp<CPUContext>::maybeAdjustOutputBatchSizes(
    const std::vector<int>& real_output_batch_sizes) {
  CAFFE_ENFORCE_EQ(real_output_batch_sizes.size(), output_shapes_.size());
  for (int i = 0; i < real_output_batch_sizes.size(); ++i) {
    if (!real_output_batch_sizes[i]) {
      continue;
    }
    auto* output_tensor = Output(i);
    output_tensor->ShrinkTo(real_output_batch_sizes[i]);
  }
}

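// Binds the current input tensors and freshly allocated output tensors to the
// ONNXIFI graph, runs the graph (through the extension path if available,
// otherwise through the standard fence-based path), and finally shrinks the
// outputs to their real batch sizes if needed.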
template <>
bool OnnxifiOp<CPUContext>::RunOnDevice() {
  CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
  for (unsigned i = 0U; i < InputSize(); ++i) {
    const auto& input_tensor = Input(i);
    const at::IntArrayRef tensor_dims = input_tensor.sizes();
    auto& tensor_descriptor = input_desc_[i];
    tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
    tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
    tensor_descriptor.dimensions = tensor_dims.size();
    auto& input_shape = input_shapes_[i];
    input_shape.clear();
    input_shape.insert(
        input_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
    tensor_descriptor.shape = input_shape.data();
    SetInputTensorDescriptorTypeAndBuffer(input_tensor, &tensor_descriptor);
  }

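  // Set up output descriptors and allocate output tensors using the shapes
  // and types reported by SetOutputShapeAndType.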
  CAFFE_ENFORCE_EQ(output_desc_.size(), OutputSize());
  for (unsigned i = 0U; i < OutputSize(); ++i) {
    tensor_dims_int64_.clear();
    std::vector<size_t> tensor_dims;
    uint64_t type = SetOutputShapeAndType(i, &tensor_dims);
    auto& tensor_descriptor = output_desc_[i];
    tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
    tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
    tensor_descriptor.dimensions = tensor_dims.size();
    CAFFE_ENFORCE(
        tensor_descriptor.dimensions != 0,
        tensor_descriptor.name,
        " has 0 dim");
    auto& output_shape = output_shapes_[i];
    output_shape.clear();
    output_shape.insert(
        output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
    tensor_descriptor.shape = output_shape.data();
    std::copy(
        tensor_dims.cbegin(),
        tensor_dims.cend(),
        std::back_inserter(tensor_dims_int64_));
    auto* output_tensor = Output(
        i,
        tensor_dims_int64_,
        at::dtype(OnnxifiTypeToDataType(type)).device(CPU));
    SetOutputTensorDescriptorTypeAndBuffer(
        type, output_tensor, &tensor_descriptor);
  }
  bool ext_supported = false;
  onnxMemoryFenceV1 input_fence;
  onnxMemoryFenceV1 output_fence;
  std::vector<int> output_batch_sizes;
#ifdef ONNXIFI_ENABLE_EXT
  /**
   * If the ONNXIFI extension mode is enabled and the backend supports
   * onnxSetIOAndRunGraph, we run through this workflow;
   * otherwise we fall back to the non-extension workflow below.
   **/
  if (onnxSetIOAndRunGraphPointer_ != nullptr) {
    ext_supported = true;
    output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
    output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
    if (enable_tracing_) {
      traces_.reset();
      traces_ = std::shared_ptr<onnxTraceEventList>(
          new onnxTraceEventList(), [this](onnxTraceEventList* p) {
            if (p && onnxReleaseTraceEventsPointer_) {
              CAFFE_ENFORCE_EQ(
                  (*onnxReleaseTraceEventsPointer_)(p), ONNXIFI_STATUS_SUCCESS);
            }
            delete p;
          });
      traces_->numEvents = 0;
    }
    CAFFE_ENFORCE_EQ(
        (*onnxSetIOAndRunGraphPointer_)(
            graph_,
            input_desc_.size(),
            input_desc_.data(),
            output_desc_.size(),
            output_desc_.data(),
            &output_fence,
            traces_.get()),
        ONNXIFI_STATUS_SUCCESS);
    output_batch_sizes = extractOutputBatchSizes();
    CAFFE_ENFORCE_EQ(
        lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
    CAFFE_ENFORCE_EQ(
        lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
  }
#endif
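  // Fallback path: standard ONNXIFI flow that sets graph IO explicitly and
  // synchronizes through input/output memory fence events.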
  if (!ext_supported) {
    CAFFE_ENFORCE_EQ(
        lib_->onnxSetGraphIO(
            graph_,
            input_desc_.size(),
            input_desc_.data(),
            output_desc_.size(),
            output_desc_.data()),
        ONNXIFI_STATUS_SUCCESS);

    input_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
    input_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
    CAFFE_ENFORCE_EQ(
        lib_->onnxInitEvent(backend_, &input_fence.event),
        ONNXIFI_STATUS_SUCCESS);
    output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
    output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;

    // Call the async run on the backend, signal the event on the input fence,
    // and wait for the event on the output fence
    CAFFE_ENFORCE_EQ(
        lib_->onnxRunGraph(graph_, &input_fence, &output_fence),
        ONNXIFI_STATUS_SUCCESS);
    CAFFE_ENFORCE_EQ(
        lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
    output_batch_sizes = extractOutputBatchSizes();
    CAFFE_ENFORCE_EQ(
        lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);

    // Destroy the event objects
    CAFFE_ENFORCE_EQ(
        lib_->onnxReleaseEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
    CAFFE_ENFORCE_EQ(
        lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
  }

  if (adjust_output_batch_) {
    maybeAdjustOutputBatchSizes(output_batch_sizes);
  }
  enable_tracing_ = false;
  return true;
}

REGISTER_CPU_OPERATOR(Onnxifi, OnnxifiOp<CPUContext>);
OPERATOR_SCHEMA(Onnxifi)
    .NumInputs(0, INT_MAX)
    .NumOutputs(0, INT_MAX)
    .SetDoc(R"DOC(
The Onnxifi operator is a black-box operator that lowers the computation to an ONNXIFI backend.
)DOC")
    .Arg(
        "onnx_model",
        "(string default=\"\") Serialized ONNX model to be converted to the backend representation")
    .Arg(
        "initializers",
        "Initialization pairs mapping weight names between the NetDef and the ONNX model")
    .Arg(
        "output_resize_hints",
        "A list of key/value pairs indicating which input index to look up for the real batch size for the given max output batch size");

} // namespace caffe2