#include "caffe2/operators/slice_op.h"
#include "caffe2/opt/bound_shape_inferencer.h"
#include "caffe2/opt/onnxifi_op.h"
namespace caffe2 {
namespace {
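// Fill in the ONNXIFI data type and buffer pointer of `desc` from a plain
// CPU tensor. Throws if the element type has no ONNXIFI equivalent.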
void setInputTensorDescriptorTypeAndBuffer(
const Tensor& cpu_tensor,
onnxTensorDescriptorV1* desc) {
if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else if (cpu_tensor.template IsType<c10::Half>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<c10::Half>());
} else if (cpu_tensor.template IsType<float>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<float>());
} else if (cpu_tensor.template IsType<int8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
} else if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int64_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT64;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int64_t>());
} else if (cpu_tensor.template IsType<int16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int16_t>());
} else if (cpu_tensor.template IsType<uint16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint16_t>());
} else {
CAFFE_THROW(
"Unsupported tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
}
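// Overload for Int8TensorCPU: besides type and buffer, also publishes the
// tensor's single (scale, zero_point) quantization pair on the descriptor.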
void setInputTensorDescriptorTypeAndBuffer(
const int8::Int8TensorCPU& cpu_int8tensor,
onnxTensorDescriptorV1* desc) {
const Tensor& cpu_tensor = cpu_int8tensor.t;
if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else {
CAFFE_THROW(
"Unsupported Int8Tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
desc->quantizationParams = 1;
desc->quantizationAxis = 1;
desc->scales = &cpu_int8tensor.scale;
desc->biases = &cpu_int8tensor.zero_point;
}
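// Subtract a constant offset from every element of a quantized tensor; used
// below to shift between zero-point conventions on the outputs.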
template <typename T>
void adjustQuantizedOffsetImpl(Tensor* t, uint8_t offset) {
auto* data = t->mutable_data<T>();
const auto numel = t->numel();
for (int64_t i = 0; i < numel; ++i) {
data[i] -= offset;
}
}
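// Dispatch on element type; only uint8 tensors need the offset adjustment.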
void adjustQuantizedOffset(Tensor* t, uint8_t offset) {
if (t->template IsType<uint8_t>()) {
adjustQuantizedOffsetImpl<uint8_t>(t, offset);
}
}
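// Map an ONNXIFI data type enum to the corresponding caffe2 TypeMeta.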
TypeMeta OnnxifiTypeToDataType(uint64_t onnxifi_type) {
static std::map<uint64_t, TypeMeta> data_type_map{
{ONNXIFI_DATATYPE_FLOAT32, TypeMeta::Make<float>()},
{ONNXIFI_DATATYPE_FLOAT16, TypeMeta::Make<c10::Half>()},
{ONNXIFI_DATATYPE_INT32, TypeMeta::Make<int>()},
{ONNXIFI_DATATYPE_INT8, TypeMeta::Make<int8_t>()},
{ONNXIFI_DATATYPE_UINT8, TypeMeta::Make<uint8_t>()},
{ONNXIFI_DATATYPE_INT64, TypeMeta::Make<int64_t>()},
{ONNXIFI_DATATYPE_INT16, TypeMeta::Make<int16_t>()},
{ONNXIFI_DATATYPE_UINT16, TypeMeta::Make<uint16_t>()},
};
const auto it = data_type_map.find(onnxifi_type);
CAFFE_ENFORCE(
it != data_type_map.end(),
"Unsupported ONNXIFI data type: ",
onnxifi_type);
return it->second;
}
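// Allocate (or reuse) the output tensor's storage with the dtype implied by
// `onnxifi_type` and point the descriptor's buffer at it.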
void setOutputTensorDescriptorTypeAndBuffer(
uint64_t onnxifi_type,
Tensor* cpu_tensor,
onnxTensorDescriptorV1* desc) {
desc->dataType = onnxifi_type;
desc->buffer = reinterpret_cast<onnxPointer>(
cpu_tensor->raw_mutable_data(OnnxifiTypeToDataType(onnxifi_type)));
}
#ifndef C10_MOBILE
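// Shallow-copy an ExternalTensorDescriptor into an onnxTensorDescriptorV1.
// Pointer fields (buffer, scales, biases, shape) still alias the source.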
void copyDescriptor(
const ExternalTensorDescriptor* from,
onnxTensorDescriptorV1* to) {
to->dataType = from->dataType;
to->buffer = from->buffer;
to->isOffline = from->isOffline;
to->quantizationParams = from->quantizationParams;
to->quantizationAxis = from->quantizationAxis;
to->scales = from->scales;
to->biases = from->biases;
to->dimensions = from->dimensions;
to->shape = from->shape;
}
#endif
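// Translate a workspace blob (TensorCPU, Int8TensorCPU, or a registered
// external tensor type) into an ONNXIFI tensor descriptor. The shapes,
// scales and offsets vectors own the out-of-line arrays the descriptor
// points into, so they must outlive `desc`.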
void BlobToTensorDescriptor(
const std::string& name,
Workspace* ws,
onnxTensorDescriptorV1* desc,
std::vector<std::vector<uint64_t>>* shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) {
const Blob* blob = ws->GetBlob(name);
CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist");
const bool is_int8tensor =
blob->meta().id() == TypeMeta::Id<int8::Int8TensorCPU>();
bool is_external_tensor;
#ifndef C10_MOBILE
auto function_ptr =
ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id());
is_external_tensor = function_ptr != nullptr;
#else
is_external_tensor = false;
#endif
// Memory type
// We only allow weights to be CPU tensor or int8tensor for now
CAFFE_ENFORCE(
(BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob) ||
is_external_tensor),
"Initialization blob ",
name,
" needs to be TensorCPU or Int8TensorCPU or Int8FCDNNLowPPackedWeightBlob Based class: ",
blob->TypeName());
desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU;
desc->isOffline = false;
if (is_int8tensor) {
// Data type
const auto& cpu_int8tensor = blob->template Get<int8::Int8TensorCPU>();
const auto& cpu_tensor = cpu_int8tensor.t;
setInputTensorDescriptorTypeAndBuffer(cpu_int8tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
} else if (is_external_tensor) {
#ifndef C10_MOBILE
ExternalTensorDescriptor ext_desc;
function_ptr->SetupExternalTensorDescriptor(
blob, shapes, all_scales, all_offsets, &ext_desc);
copyDescriptor(&ext_desc, desc);
#endif
} else {
// Data type
const auto& cpu_tensor = blob->template Get<TensorCPU>();
setInputTensorDescriptorTypeAndBuffer(cpu_tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
desc->quantizationParams = 0;
}
}
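// Map a caffe2 TensorProto data type to its ONNXIFI counterpart, returning
// ONNXIFI_DATATYPE_UNDEFINED (with a warning) for unsupported types.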
uint64_t getOnnxifiDataType(caffe2::TensorProto::DataType t) {
#define CAFFE2_TO_ONNXIFI_TYPE(x) \
case (caffe2::TensorProto::x): \
return ONNXIFI_DATATYPE_##x
switch (t) {
CAFFE2_TO_ONNXIFI_TYPE(INT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT16);
CAFFE2_TO_ONNXIFI_TYPE(INT16);
CAFFE2_TO_ONNXIFI_TYPE(INT32);
CAFFE2_TO_ONNXIFI_TYPE(INT64);
CAFFE2_TO_ONNXIFI_TYPE(FLOAT16);
case (caffe2::TensorProto::FLOAT):
return ONNXIFI_DATATYPE_FLOAT32;
default:
LOG(WARNING) << "Unsupported Caffe2 tensor type: " << t;
return ONNXIFI_DATATYPE_UNDEFINED;
}
#undef CAFFE2_TO_ONNXIFI_TYPE
}
} // namespace
namespace details {
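// TensorInfo captures the ONNXIFI type, dims and (for QTensorProto) the
// quantization parameters of a shape hint, normalizing the single-scale and
// multi-scale proto layouts into the same scales/biases vectors.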
TensorInfo::TensorInfo(const TensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(false),
quantizationAxis(0),
quantizationParams(0) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
}
TensorInfo::TensorInfo(const QTensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(true),
quantizationAxis(t.has_axis() ? t.axis() : 0),
quantizationParams(t.scales_size() ? t.scales_size() : 1) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
if (t.scales_size()) {
for (const auto d : t.scales()) {
scales.push_back(static_cast<float>(d));
}
for (const auto d : t.biases()) {
biases.push_back(static_cast<int32_t>(d));
}
} else {
scales.push_back(static_cast<float>(t.scale()));
biases.push_back(static_cast<int32_t>(t.bias()));
}
}
} // namespace details
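// Build the list of weight descriptors handed to the backend at graph
// initialization. Every name in `initializers` must resolve to a workspace
// blob; the output vectors own the storage the descriptors point into.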
template <>
std::vector<onnxTensorDescriptorV1>
OnnxifiOp<CPUContext>::buildInitializationList(
Workspace* ws,
const std::vector<std::string>& initializers,
std::vector<std::string>* weight_names,
std::vector<std::vector<uint64_t>>* weight_shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) const {
std::unordered_set<std::string> initialization_list(
initializers.begin(), initializers.end());
const std::vector<string>& ws_blobs = ws->Blobs();
// Since onnxTensorDescriptorV1.name will point into the memory in
// weight_names, we need to prevent weight_names from reallocating by
// reserving enough memory ahead of time
weight_names->reserve(ws_blobs.size());
std::vector<onnxTensorDescriptorV1> descs;
for (const auto& s : ws_blobs) {
auto it = initialization_list.find(s);
if (it != initialization_list.end()) {
weight_names->emplace_back(s);
onnxTensorDescriptorV1 tensor_desc;
tensor_desc.name = weight_names->back().c_str();
BlobToTensorDescriptor(
s, ws, &tensor_desc, weight_shapes, all_scales, all_offsets);
descs.push_back(tensor_desc);
initialization_list.erase(it);
}
}
CAFFE_ENFORCE(initialization_list.empty(), "Unfulfilled initialization list");
return descs;
}
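// Pre-allocate, per output, the begin/end index tensors used by SliceImpl
// and the fast_path flags consulted when the real batch size is smaller
// than max_batch_size_.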
template <>
details::OutputReshapeInfo OnnxifiOp<CPUContext>::initOutputReshapeInfo()
const {
details::OutputReshapeInfo output_reshape_info;
output_reshape_info.begins.reserve(output_names_.size());
output_reshape_info.ends.reserve(output_names_.size());
output_reshape_info.fast_path.reserve(output_names_.size());
for (int i = 0; i < output_names_.size(); ++i) {
const auto it = output_shape_hints_.find(i);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find output shape hints for ",
output_names_[i]);
int64_t num_dims = it->second.dims.size();
// Initialize the tensors used to slice the output
output_reshape_info.begins.emplace_back();
ReinitializeTensor(
&output_reshape_info.begins.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
output_reshape_info.ends.emplace_back();
ReinitializeTensor(
&output_reshape_info.ends.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
// fast_path was only reserve()d above; size it here because
// extractOutputBatchSizes() assigns through operator[].
output_reshape_info.fast_path.push_back(false);
}
return output_reshape_info;
}
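// Determine the real output batch size by reading the batch dim of the
// nominal input. If it differs from max_batch_size_, run bound shape
// inference once per distinct batch size and cache the resulting slice
// info in output_reshape_info_.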
template <>
int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
if (use_onnx_ || !adjust_output_batch_) {
return max_batch_size_;
}
// Get the real batch size from nominal input. If it's equal to
// max_batch_size, mark that we don't need to adjust batch size and return.
// Otherwise, do a pass of shape inference to get the real shapes of the
// outputs.
const Tensor* t = nullptr;
if (this->template InputIsType<int8::Int8TensorCPU>(nominal_batch_idx_)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(nominal_batch_idx_);
t = &input_tensor_int8.t;
} else {
t = &Input(nominal_batch_idx_);
}
CAFFE_ENFORCE(
t, "Null input shape tensor ptr. Possibly unsupported tensor type");
CAFFE_ENFORCE(
!t->sizes().empty(),
input_names_[nominal_batch_idx_],
" cannot be empty");
const auto dims = t->sizes();
const int current_batch_size = dims[0];
if (current_batch_size == max_batch_size_) {
return max_batch_size_;
}
// We still need to adjust output size but we can skip the shape inference as
// it was done before.
if (output_reshape_info_.count(current_batch_size)) {
return current_batch_size;
}
auto it =
output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo());
auto& output_reshape_info = it.first->second;
BoundShapeSpec spec(dims[0], max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto it = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(it != shape_info.end());
const auto& real_shape = it->second.shape;
const auto& max_shape = output_shapes_[i];
CAFFE_ENFORCE_EQ(real_shape.dims_size(), max_shape.size());
const auto dim_size = real_shape.dims_size();
auto& begin = output_reshape_info.begins[i];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[i];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (int j = 0; j < dim_size; ++j) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape.dims(j),
"It is weird that max shape of ",
output_names_[i],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape.dims(j),
")");
begin_ptr[j] = 0;
// Slice only dims where the max (padded) shape is strictly larger than the
// real shape; end == -1 means "keep the full extent". The enforce above
// guarantees max >= real, so a non-strict comparison here would make the
// else branch dead. A mismatch in any dim other than the batch dim (j == 0)
// leaves `mismatch` nonzero and disables the ShrinkTo fast path.
if (max_shape[j] > real_shape.dims(j)) {
end_ptr[j] = real_shape.dims(j);
mismatch += j;
} else {
end_ptr[j] = -1;
}
}
output_reshape_info.fast_path[i] = !mismatch;
}
return current_batch_size;
}
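// Trim each output to the real batch size: ShrinkTo on the batch dim when
// only dim 0 differs (fast path), otherwise a generic SliceImpl copy.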
template <>
void OnnxifiOp<CPUContext>::adjustOutputBatchSizes(int current_batch_size) {
auto it = output_reshape_info_.find(current_batch_size);
CAFFE_ENFORCE(
it != output_reshape_info_.end(),
"Cannot find current_batch_size ",
current_batch_size,
" in output_reshape_info_");
const auto& output_reshape_info = it->second;
CPUContext context;
Tensor tmp(CPU);
for (int i = 0; i < OutputSize(); ++i) {
Tensor* output_tensor = quantized_outputs_[i]
? (&this->template Output<int8::Int8TensorCPU>(i)->t)
: Output(i);
const auto& end = output_reshape_info.ends[i];
if (output_reshape_info.fast_path[i]) {
output_tensor->ShrinkTo(end.data<int32_t>()[0]);
} else {
// We need to use generic Slice
SliceImpl<int32_t, CPUContext>(
&tmp, *output_tensor, output_reshape_info.begins[i], end, &context);
output_tensor->CopyFrom(tmp);
}
}
}
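// Resize the C2 output tensor at `output_idx` according to its shape hint
// and wire up the matching ONNXIFI output descriptor, including quantization
// params for single-quantizer Int8 outputs.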
template <>
void OnnxifiOp<CPUContext>::setOutputShapeAndType(int output_idx) {
tensor_dims_int64_.clear();
std::vector<size_t> tensor_dims;
uint64_t type = ONNXIFI_DATATYPE_FLOAT32;
const auto it = output_shape_hints_.find(output_idx);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find shape hint for output: ",
output_names_[output_idx]);
const auto& info = it->second;
std::copy(
info.dims.begin(), info.dims.end(), std::back_inserter(tensor_dims));
type = it->second.onnxifi_type;
auto& tensor_descriptor = output_desc_[output_idx];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
tensor_descriptor.dimensions = tensor_dims.size();
CAFFE_ENFORCE(
tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim");
auto& output_shape = output_shapes_[output_idx];
output_shape.clear();
output_shape.insert(
output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.shape = output_shape.data();
std::copy(
tensor_dims.cbegin(),
tensor_dims.cend(),
std::back_inserter(tensor_dims_int64_));
// Setup the output C2 tensor
if (!info.quantized) {
// Normal Tensor
auto* output_tensor = Output(
output_idx,
tensor_dims_int64_,
at::dtype(OnnxifiTypeToDataType(type)).device(CPU));
setOutputTensorDescriptorTypeAndBuffer(
type, output_tensor, &tensor_descriptor);
} else if (info.quantizationParams == 1) {
// single quantizer, output Int8Tensor
auto* output_tensor =
this->template Output<int8::Int8TensorCPU>(output_idx);
output_tensor->t.Resize(tensor_dims_int64_);
setOutputTensorDescriptorTypeAndBuffer(
type, &output_tensor->t, &tensor_descriptor);
tensor_descriptor.quantizationParams = 1;
tensor_descriptor.quantizationAxis = 1;
tensor_descriptor.scales = &output_tensor->scale;
tensor_descriptor.biases = &output_tensor->zero_point;
} else {
CAFFE_THROW(
"OnnxifiOp does not support output tensor with multi-quantization params: ",
output_names_[output_idx]);
}
}
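// Main entry point: populate input/output descriptors, run the graph either
// via the onnxSetIOAndRunGraph extension or the standard onnxSetGraphIO +
// onnxRunGraph fence protocol, then undo the quantization offset and
// batch-size padding on the outputs.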
template <>
bool OnnxifiOp<CPUContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
for (unsigned i = 0U; i < InputSize(); ++i) {
auto& tensor_descriptor = input_desc_[i];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
at::IntArrayRef tensor_dims;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& cpu_tensor = input_tensor_int8.t;
tensor_dims = cpu_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(
input_tensor_int8, &tensor_descriptor);
} else {
const auto& input_tensor = Input(i);
tensor_dims = input_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(input_tensor, &tensor_descriptor);
}
auto& input_shape = input_shapes_[i];
input_shape.clear();
input_shape.insert(
input_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.dimensions = tensor_dims.size();
tensor_descriptor.shape = input_shape.data();
}
CAFFE_ENFORCE_EQ(output_desc_.size(), OutputSize());
for (unsigned i = 0U; i < OutputSize(); ++i) {
setOutputShapeAndType(i);
}
bool ext_supported = false;
onnxMemoryFenceV1 input_fence;
onnxMemoryFenceV1 output_fence;
int current_batch_size = max_batch_size_;
#ifdef ONNXIFI_ENABLE_EXT
/**
 * If the onnxifi extension mode is enabled and the backend supports
 * onnxSetIOAndRunGraph, we run through that workflow; otherwise we fall
 * back to the non-extension workflow below.
 **/
if (onnxSetIOAndRunGraphPointer_ != nullptr) {
ext_supported = true;
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
traces_.reset();
if (enable_tracing_) {
traces_ = std::shared_ptr<onnxTraceEventList>(
new onnxTraceEventList(), [this](onnxTraceEventList* p) {
if (p && onnxReleaseTraceEventsPointer_) {
CAFFE_ENFORCE_EQ(
(*onnxReleaseTraceEventsPointer_)(p), ONNXIFI_STATUS_SUCCESS);
}
delete p;
});
traces_->numEvents = 0;
}
CAFFE_ENFORCE_EQ(
(*onnxSetIOAndRunGraphPointer_)(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data(),
&output_fence,
traces_.get()),
ONNXIFI_STATUS_SUCCESS);
current_batch_size = extractOutputBatchSizes();
onnxEventState eventState;
onnxStatus eventStatus;
std::string message;
size_t messageLength = 512;
message.resize(messageLength);
CAFFE_ENFORCE_EQ(
(*onnxWaitEventForPointer_)(
output_fence.event,
timeout_,
&eventState,
&eventStatus,
const_cast<char*>(message.data()),
&messageLength),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
eventState,
ONNXIFI_EVENT_STATE_SIGNALLED,
"Onnxifi run timeouted out after ",
timeout_,
" ms.");
if (eventStatus != ONNXIFI_STATUS_SUCCESS) {
if (messageLength == 0) {
CAFFE_THROW("onnxifi internal error");
} else {
CAFFE_THROW(message);
}
}
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
#endif
if (!ext_supported) {
CAFFE_ENFORCE_EQ(
lib_->onnxSetGraphIO(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data()),
ONNXIFI_STATUS_SUCCESS);
input_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
input_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
CAFFE_ENFORCE_EQ(
lib_->onnxInitEvent(backend_, &input_fence.event),
ONNXIFI_STATUS_SUCCESS);
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
// Call the async run on backend, signal event on input fence and wait for
// the event on output fence
CAFFE_ENFORCE_EQ(
lib_->onnxRunGraph(graph_, &input_fence, &output_fence),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
current_batch_size = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
// Destroy the event objects
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
if (adjust_quantized_offset_) {
for (unsigned i = 0U; i < OutputSize(); ++i) {
if (quantized_outputs_[i]) {
auto* int8_tensor = this->template Output<int8::Int8TensorCPU>(i);
int8_tensor->zero_point += adjust_quantized_offset_;
adjustQuantizedOffset(&int8_tensor->t, adjust_quantized_offset_);
}
}
}
if (adjust_output_batch_ && current_batch_size != max_batch_size_) {
adjustOutputBatchSizes(current_batch_size);
}
enable_tracing_ = false;
return true;
}
REGISTER_CPU_OPERATOR(Onnxifi, OnnxifiOp<CPUContext>);
OPERATOR_SCHEMA(Onnxifi)
.NumInputs(0, INT_MAX)
.NumOutputs(0, INT_MAX)
.SetDoc(R"DOC(
The Onnxifi operator is a black-box operator that lowers the computation to an ONNXIFI backend.
)DOC")
.Arg(
"onnx_model",
"(string default=\"\") Serialized ONNX model to be converted to backend representation")
.Arg(
"initializers",
"Initialization pair indicating the mapping of the name between NetDef and ONNX model")
.Arg(
"output_resize_hints",
"A list of key/value pairs indicating which input index to look up for real batch size for the given max output batch size");
} // namespace caffe2