#include "caffe2/opt/onnxifi_op.h"
#include "caffe2/operators/slice_op.h"
#include "caffe2/opt/bound_shape_inferencer.h"
#include <c10/util/irange.h>
namespace caffe2 {
namespace {
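// Maps the dtype of a CPU tensor to the matching ONNXIFI data type and points
// the descriptor's buffer at the tensor's underlying storage. Throws on
// unsupported element types.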
void setInputTensorDescriptorTypeAndBuffer(
const Tensor& cpu_tensor,
onnxTensorDescriptorV1* desc) {
if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else if (cpu_tensor.template IsType<c10::Half>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<c10::Half>());
} else if (cpu_tensor.template IsType<float>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<float>());
} else if (cpu_tensor.template IsType<int8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
} else if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int64_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT64;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int64_t>());
} else if (cpu_tensor.template IsType<int16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int16_t>());
} else if (cpu_tensor.template IsType<uint16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint16_t>());
} else {
CAFFE_THROW(
"Unsupported tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
}
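// Overload for Int8TensorCPU: besides the data type and buffer, also fills in
// the single per-tensor quantization scale and zero point.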
void setInputTensorDescriptorTypeAndBuffer(
const int8::Int8TensorCPU& cpu_int8tensor,
onnxTensorDescriptorV1* desc) {
const Tensor& cpu_tensor = cpu_int8tensor.t;
if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
} else if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else {
CAFFE_THROW(
"Unsupported Int8Tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
desc->quantizationParams = 1;
desc->quantizationAxis = 1;
desc->scales = &cpu_int8tensor.scale;
desc->biases = &cpu_int8tensor.zero_point;
}
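// Subtracts a constant offset from every element of a quantized tensor. Used
// by RunOnDevice to shift uint8 outputs when adjust_quantized_offset_ is set.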
template <typename T>
void adjustQuantizedOffsetImpl(Tensor* t, uint8_t offset) {
auto* data = t->mutable_data<T>();
for (auto i: c10::irange(t->numel())) {
data[i] -= offset;
}
}
void adjustQuantizedOffset(Tensor* t, uint8_t offset) {
if (t->template IsType<uint8_t>()) {
adjustQuantizedOffsetImpl<uint8_t>(t, offset);
}
}
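// Translates an ONNXIFI data type enum value into the corresponding Caffe2
// TypeMeta; enforces that the type is supported.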
TypeMeta OnnxifiTypeToDataType(uint64_t onnxifi_type) {
static std::map<uint64_t, TypeMeta> data_type_map{
{ONNXIFI_DATATYPE_FLOAT32, TypeMeta::Make<float>()},
{ONNXIFI_DATATYPE_FLOAT16, TypeMeta::Make<c10::Half>()},
{ONNXIFI_DATATYPE_INT32, TypeMeta::Make<int>()},
{ONNXIFI_DATATYPE_INT8, TypeMeta::Make<int8_t>()},
{ONNXIFI_DATATYPE_UINT8, TypeMeta::Make<uint8_t>()},
{ONNXIFI_DATATYPE_INT64, TypeMeta::Make<int64_t>()},
{ONNXIFI_DATATYPE_INT16, TypeMeta::Make<int16_t>()},
{ONNXIFI_DATATYPE_UINT16, TypeMeta::Make<uint16_t>()},
};
const auto it = data_type_map.find(onnxifi_type);
CAFFE_ENFORCE(
it != data_type_map.end(),
"Unsupported ONNXIFI data type: ",
onnxifi_type);
return it->second;
}
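// Prepares an output descriptor: records the ONNXIFI data type and points the
// buffer at the output tensor's storage, allocating it with the matching
// Caffe2 type if necessary.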
void setOutputTensorDescriptorTypeAndBuffer(
uint64_t onnxifi_type,
Tensor* cpu_tensor,
onnxTensorDescriptorV1* desc) {
desc->dataType = onnxifi_type;
desc->buffer = reinterpret_cast<onnxPointer>(
cpu_tensor->raw_mutable_data(OnnxifiTypeToDataType(onnxifi_type)));
}
#ifndef C10_MOBILE
void copyDescriptor(
const ExternalTensorDescriptor* from,
onnxTensorDescriptorV1* to) {
to->dataType = from->dataType;
to->buffer = from->buffer;
to->isOffline = from->isOffline;
to->quantizationParams = from->quantizationParams;
to->quantizationAxis = from->quantizationAxis;
to->scales = from->scales;
to->biases = from->biases;
to->dimensions = from->dimensions;
to->shape = from->shape;
}
#endif
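// Fills an onnxTensorDescriptorV1 from a weight blob in the workspace,
// handling plain CPU tensors, Int8TensorCPU, and externally registered tensor
// types. Shapes (and, for quantized blobs, scales/offsets) are appended to the
// caller-owned vectors so the descriptor's pointers stay valid.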
void BlobToTensorDescriptor(
const std::string& name,
Workspace* ws,
onnxTensorDescriptorV1* desc,
std::vector<std::vector<uint64_t>>* shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) {
const Blob* blob = ws->GetBlob(name);
CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist");
const bool is_int8tensor =
blob->meta().id() == TypeMeta::Id<int8::Int8TensorCPU>();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool is_external_tensor;
#ifndef C10_MOBILE
auto function_ptr =
ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id());
is_external_tensor = function_ptr != nullptr;
#else
is_external_tensor = false;
#endif
// Memory type
// We only allow weights to be CPU tensor or int8tensor for now
CAFFE_ENFORCE(
(BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob) ||
is_external_tensor),
"Initialization blob ",
name,
" needs to be TensorCPU or Int8TensorCPU or Int8FCDNNLowPPackedWeightBlob Based class: ",
blob->TypeName());
desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU;
desc->isOffline = false;
if (is_int8tensor) {
// Data type
const auto& cpu_int8tensor = blob->template Get<int8::Int8TensorCPU>();
const auto& cpu_tensor = cpu_int8tensor.t;
setInputTensorDescriptorTypeAndBuffer(cpu_int8tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
} else if (is_external_tensor) {
#ifndef C10_MOBILE
ExternalTensorDescriptor ext_desc;
function_ptr->SetupExternalTensorDescriptor(
blob, shapes, all_scales, all_offsets, &ext_desc);
copyDescriptor(&ext_desc, desc);
#endif
} else {
// Data type
const auto& cpu_tensor = blob->template Get<TensorCPU>();
setInputTensorDescriptorTypeAndBuffer(cpu_tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
desc->quantizationParams = 0;
}
}
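// Maps a Caffe2 TensorProto data type to the ONNXIFI data type enum, returning
// ONNXIFI_DATATYPE_UNDEFINED (with a warning) for unsupported types.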
uint64_t getOnnxifiDataType(caffe2::TensorProto::DataType t) {
#define CAFFE2_TO_ONNXIFI_TYPE(x) \
case (caffe2::TensorProto::x): \
return ONNXIFI_DATATYPE_##x
switch (t) {
CAFFE2_TO_ONNXIFI_TYPE(INT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT16);
CAFFE2_TO_ONNXIFI_TYPE(INT16);
CAFFE2_TO_ONNXIFI_TYPE(INT32);
CAFFE2_TO_ONNXIFI_TYPE(INT64);
CAFFE2_TO_ONNXIFI_TYPE(FLOAT16);
case (caffe2::TensorProto::FLOAT):
return ONNXIFI_DATATYPE_FLOAT32;
default:
LOG(WARNING) << "Unsupported Caffe2 tensor type: " << t;
return ONNXIFI_DATATYPE_UNDEFINED;
}
#undef CAFFE2_TO_ONNXIFI_TYPE
}
} // namespace
namespace details {
TensorInfo::TensorInfo(const TensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(false),
quantizationAxis(0),
quantizationParams(0) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
}
TensorInfo::TensorInfo(const QTensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(true),
quantizationAxis(t.has_axis() ? t.axis() : 0),
quantizationParams(t.scales_size() ? t.scales_size() : 1) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
if (t.scales_size()) {
for (const auto d : t.scales()) {
scales.push_back(static_cast<float>(d));
}
for (const auto d : t.biases()) {
biases.push_back(static_cast<int32_t>(d));
}
} else {
scales.push_back(static_cast<float>(t.scale()));
biases.push_back(static_cast<int32_t>(t.bias()));
}
}
} // namespace details
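// Builds tensor descriptors for every workspace blob named in `initializers`,
// recording the weight names and shapes in caller-owned vectors so that the
// descriptors' name/shape pointers remain valid. Enforces that every requested
// initializer was found in the workspace.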
template <>
std::vector<onnxTensorDescriptorV1>
OnnxifiOp<CPUContext>::buildInitializationList(
Workspace* ws,
const std::vector<std::string>& initializers,
std::vector<std::string>* weight_names,
std::vector<std::vector<uint64_t>>* weight_shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) const {
std::unordered_set<std::string> initialization_list(
initializers.begin(), initializers.end());
const std::vector<string>& ws_blobs = ws->Blobs();
// Since onnxTensorDescriptorV1.name will point into the memory in
// weight_names, we need to prevent weight_names from reallocating by
// reserving enough memory ahead of time
weight_names->reserve(ws_blobs.size());
std::vector<onnxTensorDescriptorV1> descs;
for (const auto& s : ws_blobs) {
auto it = initialization_list.find(s);
if (it != initialization_list.end()) {
weight_names->emplace_back(s);
onnxTensorDescriptorV1 tensor_desc;
tensor_desc.name = weight_names->back().c_str();
BlobToTensorDescriptor(
s, ws, &tensor_desc, weight_shapes, all_scales, all_offsets);
descs.push_back(tensor_desc);
initialization_list.erase(it);
}
}
CAFFE_ENFORCE(initialization_list.empty(), "Unfulfilled initialization list");
return descs;
}
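// Creates per-output begin/end tensors (sized from the output shape hints)
// that are later used to slice outputs back down to the real batch size.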
template <>
details::OutputReshapeInfo OnnxifiOp<CPUContext>::initOutputReshapeInfo()
const {
details::OutputReshapeInfo output_reshape_info;
output_reshape_info.begins.reserve(output_names_.size());
output_reshape_info.ends.reserve(output_names_.size());
output_reshape_info.fast_path.reserve(output_names_.size());
for (auto i: c10::irange(output_names_.size())) {
const auto it = output_shape_hints_.find(i);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find output shape hints for ",
output_names_[i]);
int64_t num_dims = it->second.dims.size();
// Initialize the tensors used to slice the output
output_reshape_info.begins.emplace_back();
ReinitializeTensor(
&output_reshape_info.begins.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
output_reshape_info.ends.emplace_back();
ReinitializeTensor(
&output_reshape_info.ends.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
}
return output_reshape_info;
}
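// Computes the slice bounds that map an output produced at max batch size back
// to its real shape. The fast path is taken only when any size mismatch is
// confined to dim 0 (the batch dimension), in which case a simple ShrinkTo
// suffices instead of a generic Slice.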
template <>
template <typename DimContainer>
void OnnxifiOp<CPUContext>::fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo& output_reshape_info,
int currentIndex) {
CAFFE_ENFORCE_EQ(real_shape.size(), max_shape.size());
const auto dim_size = real_shape.size();
auto& begin = output_reshape_info.begins[currentIndex];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[currentIndex];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (auto j: c10::irange(dim_size)) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape[j],
"It is weird that max shape of ",
output_names_[currentIndex],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape[j],
")");
begin_ptr[j] = 0;
if (max_shape[j] > static_cast<uint64_t>(real_shape[j])) {
end_ptr[j] = real_shape[j];
mismatch += j;
} else {
end_ptr[j] = max_shape[j];
}
}
if (dim_size > 0) {
output_reshape_info.fast_path[currentIndex] = !mismatch;
} else {
output_reshape_info.fast_path[currentIndex] = false;
}
}
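// Populates output_reshape_info_ for the given real batch size, either from
// output shapes passed in by the caller or by running bound shape inference
// over the net with the current input shapes.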
template <>
void OnnxifiOp<CPUContext>::extractOutputBatchSizes(int current_batch_size) {
auto& output_reshape_info =
output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo())
.first->second;
if (use_passed_output_shapes_) {
const auto shape_info_it = output_shapes_per_bs_.find(current_batch_size);
CAFFE_ENFORCE(
shape_info_it != output_shapes_per_bs_.end(),
"Unable to find outputs shapes for bs=",
current_batch_size);
CAFFE_ENFORCE_EQ(shape_info_it->second.size(), OutputSize());
for (int i = 0; i < OutputSize(); ++i) {
fillOutputReshapeInfo(
shape_info_it->second[i],
output_shapes_max_bs_[i],
output_reshape_info,
i);
}
} else {
BoundShapeSpec spec(current_batch_size, max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto find_res = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(find_res != shape_info.end());
fillOutputReshapeInfo(
find_res->second.shape.dims(),
output_shapes_max_bs_[i],
output_reshape_info,
i);
}
}
}
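// Determines the real batch size from the nominal batch input. Returns
// max_batch_size_ when no adjustment is needed; otherwise ensures that
// output_reshape_info_ has an entry for the real batch size and returns it.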
template <>
int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
if (use_onnx_ || !adjust_output_batch_) {
return max_batch_size_;
}
// Get the real batch size from nominal input. If it's equal to
// max_batch_size, mark that we don't need to adjust batch size and return.
// Otherwise, do a pass of shape inference to get the real shapes of the
// outputs.
const Tensor* t = nullptr;
if (this->template InputIsType<int8::Int8TensorCPU>(nominal_batch_idx_)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(nominal_batch_idx_);
t = &input_tensor_int8.t;
} else {
t = &Input(nominal_batch_idx_);
}
CAFFE_ENFORCE(
t, "Null input shape tensor ptr. Possibly unsupported tensor type");
CAFFE_ENFORCE(
!t->sizes().empty(),
input_names_[nominal_batch_idx_],
" cannot be empty");
const auto dims = t->sizes();
const int current_batch_size = dims[0];
if (current_batch_size == max_batch_size_) {
return max_batch_size_;
}
// We still need to adjust output size but we can skip the shape inference as
// it was done before.
if (output_reshape_info_.count(current_batch_size)) {
return current_batch_size;
}
extractOutputBatchSizes(current_batch_size);
return current_batch_size;
}
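// Shrinks or slices each output tensor from max batch size down to the real
// batch size using the precomputed begin/end bounds.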
template <>
void OnnxifiOp<CPUContext>::adjustOutputBatchSizes(int current_batch_size) {
auto it = output_reshape_info_.find(current_batch_size);
CAFFE_ENFORCE(
it != output_reshape_info_.end(),
"Cannot find current_batch_size ",
current_batch_size,
" in output_reshape_info_");
const auto& output_reshape_info = it->second;
CPUContext context;
Tensor tmp(CPU);
for (int i = 0; i < OutputSize(); ++i) {
Tensor* output_tensor = quantized_outputs_[i]
? (&this->template Output<int8::Int8TensorCPU>(i)->t)
: Output(i);
const auto& end = output_reshape_info.ends[i];
if (output_reshape_info.fast_path[i]) {
output_tensor->ShrinkTo(end.data<int32_t>()[0]);
} else {
// We need to use generic Slice
SliceImpl<int32_t, CPUContext>(
&tmp, *output_tensor, output_reshape_info.begins[i], end, &context);
output_tensor->CopyFrom(tmp);
}
}
}
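// Sets up the descriptor and allocates the Caffe2 output tensor for one
// output, using the shape hints at max batch size. Quantized outputs with a
// single quantization parameter are emitted as Int8TensorCPU.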
template <>
void OnnxifiOp<CPUContext>::setOutputShapeAndType(
int output_idx,
c10::SmallVector<int64_t, 4>& tensor_dims_int64) {
tensor_dims_int64.clear();
std::vector<size_t> tensor_dims;
uint64_t type = ONNXIFI_DATATYPE_FLOAT32;
const auto it = output_shape_hints_.find(output_idx);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find shape hint for output: ",
output_names_[output_idx]);
const auto& info = it->second;
std::copy(
info.dims.begin(), info.dims.end(), std::back_inserter(tensor_dims));
type = it->second.onnxifi_type;
auto& tensor_descriptor = output_desc_[output_idx];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
tensor_descriptor.dimensions = tensor_dims.size();
CAFFE_ENFORCE(
tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim");
auto& output_shape = output_shapes_max_bs_[output_idx];
output_shape.clear();
output_shape.insert(
output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.shape = output_shape.data();
std::copy(
tensor_dims.cbegin(),
tensor_dims.cend(),
std::back_inserter(tensor_dims_int64));
// Setup the output C2 tensor
if (!info.quantized) {
// Normal Tensor
auto* output_tensor = Output(
output_idx,
tensor_dims_int64,
at::dtype(OnnxifiTypeToDataType(type)).device(CPU));
setOutputTensorDescriptorTypeAndBuffer(
type, output_tensor, &tensor_descriptor);
} else if (info.quantizationParams == 1) {
// single quantizer, output Int8Tensor
auto* output_tensor =
this->template Output<int8::Int8TensorCPU>(output_idx);
output_tensor->t.Resize(tensor_dims_int64);
setOutputTensorDescriptorTypeAndBuffer(
type, &output_tensor->t, &tensor_descriptor);
tensor_descriptor.quantizationParams = 1;
tensor_descriptor.quantizationAxis = 1;
tensor_descriptor.scales = &output_tensor->scale;
tensor_descriptor.biases = &output_tensor->zero_point;
} else {
CAFFE_THROW(
"OnnxifiOp does not support output tensor with multi-quantization params: ",
output_names_[output_idx]);
}
}
string mapOnnxStateToString(onnxEventState state) {
switch (state) {
case ONNXIFI_EVENT_STATE_NONSIGNALLED:
return "ONNXIFI_EVENT_STATE_NONSIGNALLED";
default:
return "ONNXIFI_EVENT_STATE_STRING_NOT_MAPPED";
}
}
string mapOnnxStatusToString(onnxStatus status) {
switch (status) {
case ONNXIFI_STATUS_SUCCESS:
return "ONNXIFI_STATUS_SUCCESS";
case ONNXIFI_STATUS_FALLBACK:
return "ONNXIFI_STATUS_FALLBACK";
case ONNXIFI_STATUS_INVALID_ID:
return "ONNXIFI_STATUS_INVALID_ID";
case ONNXIFI_STATUS_INVALID_SIZE:
return "ONNXIFI_STATUS_INVALID_SIZE";
case ONNXIFI_STATUS_INVALID_POINTER:
return "ONNXIFI_STATUS_INVALID_POINTER";
case ONNXIFI_STATUS_INVALID_PROTOBUF:
return "ONNXIFI_STATUS_INVALID_PROTOBUF";
case ONNXIFI_STATUS_INVALID_MODEL:
return "ONNXIFI_STATUS_INVALID_MODEL";
case ONNXIFI_STATUS_INVALID_BACKEND:
return "ONNXIFI_STATUS_INVALID_BACKEND";
case ONNXIFI_STATUS_INVALID_GRAPH:
return "ONNXIFI_STATUS_INVALID_GRAPH";
case ONNXIFI_STATUS_INVALID_EVENT:
return "ONNXIFI_STATUS_INVALID_EVENT";
case ONNXIFI_STATUS_INVALID_STATE:
return "ONNXIFI_STATUS_INVALID_STATE";
case ONNXIFI_STATUS_INVALID_NAME:
return "ONNXIFI_STATUS_INVALID_NAME";
case ONNXIFI_STATUS_INVALID_SHAPE:
return "ONNXIFI_STATUS_INVALID_SHAPE";
case ONNXIFI_STATUS_INVALID_DATATYPE:
return "ONNXIFI_STATUS_INVALID_DATATYPE";
case ONNXIFI_STATUS_INVALID_MEMORY_TYPE:
return "ONNXIFI_STATUS_INVALID_MEMORY_TYPE";
case ONNXIFI_STATUS_INVALID_MEMORY_LOCATION:
return "ONNXIFI_STATUS_INVALID_MEMORY_LOCATION";
case ONNXIFI_STATUS_INVALID_FENCE_TYPE:
return "ONNXIFI_STATUS_INVALID_FENCE_TYPE";
case ONNXIFI_STATUS_INVALID_PROPERTY:
return "ONNXIFI_STATUS_INVALID_PROPERTY";
case ONNXIFI_STATUS_UNSUPPORTED_TAG:
return "ONNXIFI_STATUS_UNSUPPORTED_TAG";
case ONNXIFI_STATUS_UNSUPPORTED_VERSION:
return "ONNXIFI_STATUS_UNSUPPORTED_VERSION";
case ONNXIFI_STATUS_UNSUPPORTED_OPERATOR:
return "ONNXIFI_STATUS_UNSUPPORTED_OPERATOR";
case ONNXIFI_STATUS_UNSUPPORTED_ATTRIBUTE:
return "ONNXIFI_STATUS_UNSUPPORTED_ATTRIBUTE";
case ONNXIFI_STATUS_UNSUPPORTED_SHAPE:
return "ONNXIFI_STATUS_UNSUPPORTED_SHAPE";
case ONNXIFI_STATUS_UNSUPPORTED_DATATYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_DATATYPE";
case ONNXIFI_STATUS_UNSUPPORTED_MEMORY_TYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_MEMORY_TYPE";
case ONNXIFI_STATUS_UNSUPPORTED_FENCE_TYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_FENCE_TYPE";
case ONNXIFI_STATUS_UNSUPPORTED_PROPERTY:
return "ONNXIFI_STATUS_UNSUPPORTED_PROPERTY";
case ONNXIFI_STATUS_UNIDENTIFIED_NAME:
return "ONNXIFI_STATUS_UNIDENTIFIED_NAME";
case ONNXIFI_STATUS_MISMATCHING_SHAPE:
return "ONNXIFI_STATUS_MISMATCHING_SHAPE";
case ONNXIFI_STATUS_MISMATCHING_DATATYPE:
return "ONNXIFI_STATUS_MISMATCHING_DATATYPE";
case ONNXIFI_STATUS_NO_SYSTEM_MEMORY:
return "ONNXIFI_STATUS_NO_SYSTEM_MEMORY";
case ONNXIFI_STATUS_NO_DEVICE_MEMORY:
return "ONNXIFI_STATUS_NO_DEVICE_MEMORY";
case ONNXIFI_STATUS_NO_SYSTEM_RESOURCES:
return "ONNXIFI_STATUS_NO_SYSTEM_RESOURCES";
case ONNXIFI_STATUS_NO_DEVICE_RESOURCES:
return "ONNXIFI_STATUS_NO_DEVICE_RESOURCES";
case ONNXIFI_STATUS_BACKEND_UNAVAILABLE:
return "ONNXIFI_STATUS_BACKEND_UNAVAILABLE";
case ONNXIFI_STATUS_INTERNAL_ERROR:
return "ONNXIFI_STATUS_INTERNAL_ERROR";
case ONNXIFI_STATUS_FATAL_ERROR:
return "ONNXIFI_STATUS_FATAL_ERROR";
default:
return "ONNXIFI_STATUS_STRING_NOT_MAPPED";
}
}
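// Runs the lowered graph on the ONNXIFI backend: fills input/output
// descriptors, invokes either the extension path (onnxSetIOAndRunGraph) or the
// standard onnxSetGraphIO/onnxRunGraph path, then adjusts quantized offsets
// and output batch sizes as needed.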
template <>
bool OnnxifiOp<CPUContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
for (auto i: c10::irange(InputSize())) {
auto& tensor_descriptor = input_desc_[i];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
at::IntArrayRef tensor_dims;
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
this->template Input<int8::Int8TensorCPU>(i);
const auto& cpu_tensor = input_tensor_int8.t;
tensor_dims = cpu_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(
input_tensor_int8, &tensor_descriptor);
} else {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
const auto& input_tensor = Input(i);
tensor_dims = input_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(input_tensor, &tensor_descriptor);
}
auto& input_shape = input_shapes_[i];
input_shape.clear();
input_shape.insert(
input_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.dimensions = tensor_dims.size();
tensor_descriptor.shape = input_shape.data();
}
CAFFE_ENFORCE_EQ(output_desc_.size(), OutputSize());
c10::SmallVector<int64_t, 4> tensor_dims_int64;
for (auto i: c10::irange(OutputSize())) {
setOutputShapeAndType(i, tensor_dims_int64);
}
bool ext_supported = false;
onnxMemoryFenceV1 input_fence;
onnxMemoryFenceV1 output_fence;
std::vector<int> output_batch_sizes;
int current_batch_size = max_batch_size_;
#ifdef ONNXIFI_ENABLE_EXT
/**
* If onnxifi extension mode is enabled and onnxSetIOAndRunGraph is
* supported by the backend, we run through this workflow; otherwise we
* fall back to the non-onnxifi-extension workflow.
**/
if (onnxSetIOAndRunGraphPointer_ != nullptr) {
ext_supported = true;
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
traces_.reset();
if (enable_tracing_) {
traces_ = std::shared_ptr<onnxTraceEventList>(
new onnxTraceEventList(), [this](onnxTraceEventList* p) {
if (p && onnxReleaseTraceEventsPointer_) {
CAFFE_ENFORCE_EQ(
(*onnxReleaseTraceEventsPointer_)(p), ONNXIFI_STATUS_SUCCESS);
}
delete p;
});
traces_->numEvents = 0;
}
const onnxStatus status = (*onnxSetIOAndRunGraphPointer_)(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data(),
&output_fence,
traces_.get());
CAFFE_ENFORCE_EQ(
status,
ONNXIFI_STATUS_SUCCESS,
"Reason: onnxSetIOAndRunGraph returned status code ",
mapOnnxStatusToString(status));
// Check if we should rely on Onnxifi to provide current batch size
if (use_onnxifi_batch_size_ && onnxGetCurrentBatchSizePointer_ != nullptr) {
int64_t onnxifiBatchSize;
if ((*onnxGetCurrentBatchSizePointer_)(&onnxifiBatchSize) == ONNXIFI_STATUS_SUCCESS) {
current_batch_size = onnxifiBatchSize;
if (current_batch_size != max_batch_size_ &&
output_reshape_info_.count(current_batch_size) == 0) {
extractOutputBatchSizes(current_batch_size);
}
} else {
current_batch_size = extractOutputBatchSizes();
}
} else {
current_batch_size = extractOutputBatchSizes();
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
onnxEventState eventState;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
onnxStatus eventStatus;
std::string message;
size_t messageLength = 512;
message.resize(messageLength);
CAFFE_ENFORCE_EQ(
(*onnxWaitEventForPointer_)(
output_fence.event,
timeout_,
&eventState,
&eventStatus,
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<char*>(message.data()),
&messageLength),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
eventState,
ONNXIFI_EVENT_STATE_SIGNALLED,
"Onnxifi run timeouted out after ",
timeout_,
" ms.",
"Reason: Onnxifi run returned event state code ",
mapOnnxStateToString(eventState));
if (eventStatus != ONNXIFI_STATUS_SUCCESS) {
if (messageLength == 0) {
CAFFE_THROW("onnxifi internal error");
} else {
CAFFE_THROW(message);
}
}
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
#endif
if (!ext_supported) {
CAFFE_ENFORCE_EQ(
lib_->onnxSetGraphIO(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data()),
ONNXIFI_STATUS_SUCCESS);
input_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
input_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
CAFFE_ENFORCE_EQ(
lib_->onnxInitEvent(backend_, &input_fence.event),
ONNXIFI_STATUS_SUCCESS);
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
// Call the async run on backend, signal event on input fence and wait for
// the event on output fence
CAFFE_ENFORCE_EQ(
lib_->onnxRunGraph(graph_, &input_fence, &output_fence),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
current_batch_size = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
// Destroy the event objects
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
if (adjust_quantized_offset_) {
for (auto i: c10::irange(OutputSize())) {
if (quantized_outputs_[i]) {
auto* int8_tensor = this->template Output<int8::Int8TensorCPU>(i);
int8_tensor->zero_point += adjust_quantized_offset_;
adjustQuantizedOffset(&int8_tensor->t, adjust_quantized_offset_);
}
}
}
if (adjust_output_batch_ && current_batch_size != max_batch_size_) {
adjustOutputBatchSizes(current_batch_size);
}
enable_tracing_ = false;
return true;
}
REGISTER_CPU_OPERATOR(Onnxifi, OnnxifiOp<CPUContext>);
OPERATOR_SCHEMA(Onnxifi)
.NumInputs(0, INT_MAX)
.NumOutputs(0, INT_MAX)
.SetDoc(R"DOC(
The Onnxifi operator is a black-box operator that lowers the computation to an Onnxifi backend
)DOC")
.Arg(
"onnx_model",
"(string default=\"\") Serialized ONNX model to be converted to backend representation")
.Arg(
"initializers",
"Initialization pair indicating the mapping of the name between NetDef and ONNX model")
.Arg(
"output_resize_hints",
"A list of key/value pairs indicating which input index to look up for real batch size for the given max output batch size");
} // namespace caffe2