| #pragma once |
| |
| #include <unordered_map> |
| |
| #include "onnx/onnx_pb.h" |
| |
| #include <c10/util/Exception.h> |
| #include <c10/util/SmallVector.h> |
| #include <c10/util/irange.h> |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/onnx/onnxifi_graph_info.h" |
| #include "caffe2/onnx/onnxifi_init.h" |
| #include "caffe2/opt/shape_info.h" |
| #include "caffe2/utils/proto_utils.h" |
| #include "caffe2/utils/string_utils.h" |
| |
| namespace caffe2 { |
| namespace details { |
| |
| /// Provides slicing info for the outputs. All the vector members should have
| /// the same size as the number of outputs of the Onnxifi op.
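| /// All three vectors are indexed by output position: begins[i] and ends[i]
| /// describe the slice of the i-th max-batch-size output that holds real
| /// data, and fast_path[i] records whether that slice can be realized by
| /// metadata-only shrinking (see OnnxifiOp::adjustOutputBatchSizes below).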
| struct OutputReshapeInfo { |
| std::vector<Tensor> begins; |
| std::vector<Tensor> ends; |
| std::vector<bool> fast_path; |
| }; |
| |
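| /// Shape and type info for one (possibly quantized) tensor, parsed from a
| /// TensorProto or QTensorProto shape hint. For a plain float hint with dims
| /// [32, 100], for instance, dims becomes {32, 100}, quantized is false, and
| /// onnxifi_type holds the matching ONNXIFI data type (the exact mapping
| /// lives in the constructors defined in the .cc file).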
| struct TensorInfo { |
| std::vector<uint64_t> dims; |
| uint64_t onnxifi_type; |
| bool quantized; |
| uint32_t quantizationAxis; |
| uint64_t quantizationParams; |
| std::vector<float> scales; |
| std::vector<int32_t> biases; |
| explicit TensorInfo(const TensorProto& t); |
| explicit TensorInfo(const QTensorProto& t); |
| TensorInfo(TensorInfo&&) = default; |
| TensorInfo& operator=(TensorInfo&&) = default; |
| }; |
| } // namespace details |
| |
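| // A minimal configuration sketch (illustrative only: "X", "Y", and the
| // serialized model string are placeholders, and real nets are usually built
| // by higher-level tooling). The constructor below reads everything from
| // OperatorDef arguments:
| //
| //   OperatorDef def;
| //   def.set_type("Onnxifi");
| //   def.add_input("X");   // bound by position to input_names[0]
| //   def.add_output("Y");  // bound by position to output_names[0]
| //   auto* model_arg = def.add_arg();
| //   model_arg->set_name("onnx_model");
| //   model_arg->set_s(serialized_model);
| //   auto* in_arg = def.add_arg();
| //   in_arg->set_name("input_names");
| //   in_arg->add_strings("X");
| //   auto* out_arg = def.add_arg();
| //   out_arg->set_name("output_names");
| //   out_arg->add_strings("Y");
| //   auto* bs_arg = def.add_arg();
| //   bs_arg->set_name("max_batch_size");
| //   bs_arg->set_i(32);
| //   // ... plus model_id/net_pos, max_seq_size, initializers, etc.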
| template <typename Context> |
| class OnnxifiOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| explicit OnnxifiOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| use_onnx_(this->template GetSingleArgument<int>("use_onnx", 0)), |
| use_glow_aot_(this->template GetSingleArgument<int>("use_glow_aot", 0)), |
| max_batch_size_( |
| this->template GetSingleArgument<int>("max_batch_size", 0)), |
| max_seq_size_(this->template GetSingleArgument<int>("max_seq_size", 0)), |
| timeout_(this->template GetSingleArgument<int>("timeout", 0)), |
| nominal_batch_idx_( |
| this->template GetSingleArgument<int>("nominal_batch_idx", 0)), |
| use_passed_output_shapes_(this->template GetSingleArgument<int>("use_passed_output_shapes", 0)), |
| adjust_quantized_offset_(this->template GetSingleArgument<int>( |
| "adjust_quantized_offset", |
| 128)), |
| use_onnxifi_batch_size_(this->template GetSingleArgument<int>( |
| "use_onnxifi_batch_size", |
| 0)) { |
| lib_ = onnx::initOnnxifiLibrary(); |
| backend_graph_map_ptr_ = onnx::getOnnxBackendGraphMap(); |
| CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library"); |
| auto onnx_model_str = |
| this->template GetSingleArgument<std::string>("onnx_model", ""); |
| CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty"); |
| if (use_glow_aot_) { |
| auto netdef_str = |
| this->template GetSingleArgument<std::string>("netdef_str", ""); |
| CAFFE_ENFORCE(ParseProtoFromLargeString(netdef_str, &netdef_)); |
| } else if (!use_onnx_) { |
| CAFFE_ENFORCE(ParseProtoFromLargeString(onnx_model_str, &netdef_)); |
| } |
| |
| // Setup input/output descriptor templates |
| input_names_ = |
| this->template GetRepeatedArgument<std::string>("input_names"); |
| output_names_ = |
| this->template GetRepeatedArgument<std::string>("output_names"); |
| CAFFE_ENFORCE_EQ(input_names_.size(), operator_def.input_size()); |
| CAFFE_ENFORCE_EQ(output_names_.size(), operator_def.output_size()); |
| for (const auto& input : input_names_) { |
| input_desc_.push_back(onnxTensorDescriptorV1()); |
| input_desc_.back().name = input.c_str(); |
| } |
| all_offsets_.reserve(ws->Blobs().size()); |
| all_scales_.reserve(ws->Blobs().size()); |
| input_shapes_.resize(input_names_.size()); |
| output_shapes_max_bs_.resize(output_names_.size()); |
| quantized_outputs_.resize(output_names_.size(), false); |
| int output_idx = 0; |
| ArgumentHelper helper(operator_def); |
| auto output_shape_info = |
| helper.GetRepeatedArgument<TensorProto>("output_shape_info"); |
| auto output_qshape_info = |
| helper.GetRepeatedArgument<QTensorProto>("output_qshape_info"); |
| std::unordered_map<std::string, TensorProto> output_shape_map; |
| for (const auto& info : output_shape_info) { |
| output_shape_map.emplace(info.name(), info); |
| } |
| std::unordered_map<std::string, QTensorProto> output_qshape_map; |
| for (const auto& info : output_qshape_info) { |
| output_qshape_map.emplace(info.name(), info); |
| } |
| bool has_quantized_output = false; |
| for (const auto& output : output_names_) { |
| output_desc_.push_back(onnxTensorDescriptorV1()); |
| output_desc_.back().name = output.c_str(); |
| |
| // For each output, try to get its output shape hint
| const auto it = output_shape_map.find(output); |
| if (it != output_shape_map.end()) { |
| output_shape_hints_.emplace( |
| output_idx, details::TensorInfo(it->second)); |
| } else { |
| const auto qit = output_qshape_map.find(output); |
| if (qit != output_qshape_map.end()) { |
| output_shape_hints_.emplace( |
| output_idx, details::TensorInfo(qit->second)); |
| quantized_outputs_[output_idx] = true; |
| has_quantized_output = true; |
| } |
| } |
| ++output_idx; |
| } |
| if (!has_quantized_output) { |
| adjust_quantized_offset_ = 0; |
| } |
| |
| LOG(INFO) << "use_onnx_=" << use_onnx_ |
| << ", use_glow_aot_=" << use_glow_aot_ |
| << ", use_passed_output_shapes_=" << use_passed_output_shapes_ |
| << ", use_onnxifi_batch_size_=" << use_onnxifi_batch_size_; |
| |
| if (use_passed_output_shapes_) { |
| // Populate output_shapes_per_bs_ |
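| // Each batch size bs in [1, max_batch_size_) is described by repeated
| // TensorProto arguments named "output_shapes_bs_<bs>" (plus
| // "output_qshapes_bs_<bs>" for quantized outputs). For example, a
| // hypothetical net with a single output "Y" and max_batch_size_ == 4 would
| // carry "output_shapes_bs_1" through "output_shapes_bs_3", each holding one
| // TensorProto whose name() is "Y" and whose dims() give Y's shape at that
| // batch size.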
| for (const auto bs : c10::irange(1, max_batch_size_)) { |
| auto output_shapes_tp = helper.GetRepeatedArgument<TensorProto>("output_shapes_bs_" + caffe2::to_string(bs)); |
| auto output_qshapes_tp = helper.GetRepeatedArgument<TensorProto>("output_qshapes_bs_" + caffe2::to_string(bs)); |
| CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size()); |
| |
| std::unordered_map<std::string, details::TensorInfo> name_to_shape; |
| for (const auto& output_shape_tp : output_shapes_tp) { |
| name_to_shape.emplace(output_shape_tp.name(), details::TensorInfo{output_shape_tp}); |
| } |
| for (const auto& output_qshape_tp : output_qshapes_tp) { |
| name_to_shape.emplace(output_qshape_tp.name(), details::TensorInfo{output_qshape_tp}); |
| } |
| |
| for (const auto& output : output_names_) { |
| auto it = name_to_shape.find(output); |
| CAFFE_ENFORCE(it != name_to_shape.end()); |
| output_shapes_per_bs_[bs].push_back({}); |
| auto &output_shapes = output_shapes_per_bs_[bs].back(); |
| std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes)); |
| } |
| } |
| } |
| |
| // Get output resizing hints |
| adjust_output_batch_ = |
| this->template GetSingleArgument<int>("adjust_output_batch", 0); |
| |
| // Encode arguments starting with "custom_" to backend |
| std::vector<uint64_t> property_pointers; |
| std::vector<int64_t> int_args; |
| std::vector<float> float_args; |
| buildPropertyList(operator_def, &property_pointers, &int_args, &float_args); |
| |
| // Initialize the backend if it has not already been created. When we
| // initialize the backend, we get the weights (initializers) from the
| // workspace and offload them onto the backend. This should be done only
| // once. Subsequent calls of this function with the same model id should
| // find a cached backend, so there is no need to repeat the above process.
| buildBackendAndGraph(ws, property_pointers, onnx_model_str); |
| } |
| |
| ~OnnxifiOp() { |
| backend_graph_shared_ptr_.reset(); |
| backend_graph_map_ptr_->remove(op_id_string_); |
| #ifdef ONNXIFI_ENABLE_EXT |
| traces_.reset(); |
| #endif |
| } |
| |
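| /// Runs one inference on the offloaded graph: fills the cached input/output
| /// descriptors from the op's blobs, invokes the backend (through
| /// onnxSetIOAndRunGraph when the onnxifi extension exposes it), and adjusts
| /// output batch sizes afterwards when needed. Implemented in the .cc file.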
| bool RunOnDevice() override; |
| |
| void setEnableTracing(bool b) { |
| enable_tracing_ = b; |
| } |
| |
| #ifdef ONNXIFI_ENABLE_EXT |
| std::shared_ptr<onnxTraceEventList> traces() const { |
| return traces_; |
| } |
| #endif |
| private: |
| // The second argument is a cache vector to avoid repeated reallocation.
| // Its existence is not ideal; it is there purely because we use int64_t for
| // c2::Tensor dims but uint64_t for onnxTensorDescriptor dims. Maybe we
| // should just use int64_t everywhere.
| void setOutputShapeAndType( |
| int output_idx, |
| c10::SmallVector<int64_t, 4>& tensor_dims_int64); |
| |
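| // Builds the property list handed to onnxInitBackend. This base
| // implementation ignores the int/float argument vectors and only emits the
| // ONNXIFI_BACKEND_PROPERTY_NONE terminator; the "custom_" arguments
| // mentioned in the constructor would be encoded here by a backend-specific
| // specialization (not shown in this header).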
| void buildPropertyList( |
| const OperatorDef& /* unused */, |
| std::vector<uint64_t>* property_list, |
| std::vector<int64_t>* /* unused */, |
| std::vector<float>* /* unused */) { |
| property_list->push_back(ONNXIFI_BACKEND_PROPERTY_NONE); |
| } |
| |
| void buildBackendAndGraph( |
| Workspace* ws, |
| const std::vector<uint64_t>& property_pointers, |
| const std::string& onnx_model_str) { |
| op_id_string_ = |
| this->template GetSingleArgument<std::string>("model_id", "") + ":" + |
| this->template GetSingleArgument<std::string>("net_pos", ""); |
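| // For example, model_id "170" and net_pos "2" produce the key "170:2",
| // which is used to look up and cache the backend graph shared by op
| // instances created from the same model and net position.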
| |
| auto initializers = |
| this->template GetRepeatedArgument<std::string>("initializers"); |
| // Build the Onnxifi engine |
| auto backend_index = |
| this->template GetSingleArgument<int>("backend_id", use_onnx_ ? 1 : 0); |
| // If using Glow AOT, override the backend_id to 1, since it uses a custom |
| // ONNX format, and that's the id we use for the ONNX backend. |
| if (use_glow_aot_) { |
| backend_index = 1; |
| } |
| auto creator = [this, |
| ws, |
| property_pointers, |
| backend_index, |
| &onnx_model_str, |
| &initializers]() { |
| std::vector<onnxBackendID> backend_ids; |
| size_t num_backends{0}; |
| CAFFE_ENFORCE_EQ( |
| lib_->onnxGetBackendIDs(nullptr, &num_backends), |
| ONNXIFI_STATUS_FALLBACK); |
| CAFFE_ENFORCE_GT( |
| num_backends, 0, "At least 1 onnxifi backend should be available"); |
| CAFFE_ENFORCE_LT( |
| backend_index, |
| num_backends, |
| "Backend idx out of bound: ", |
| backend_index, |
| ", #backends: ", |
| num_backends); |
| backend_ids.resize(num_backends); |
| CAFFE_ENFORCE_EQ( |
| lib_->onnxGetBackendIDs(backend_ids.data(), &num_backends), |
| ONNXIFI_STATUS_SUCCESS); |
| |
| onnxBackendID backend_id = backend_ids[backend_index]; |
| onnxBackend backend{nullptr}; |
| |
| CAFFE_ENFORCE_EQ( |
| lib_->onnxInitBackend(backend_id, property_pointers.data(), &backend), |
| ONNXIFI_STATUS_SUCCESS); |
| |
| // Release unused backend ids. |
| for (const auto i : c10::irange(num_backends)) { |
| if (i == static_cast<size_t>(backend_index)) { |
| continue; |
| } |
| lib_->onnxReleaseBackendID(backend_ids[i]); |
| } |
| |
| // Get weights |
| std::vector<std::string> weight_names; |
| std::vector<std::vector<uint64_t>> weight_shapes; |
| auto weight_descs = buildInitializationList( |
| ws, |
| initializers, |
| &weight_names, |
| &weight_shapes, |
| &all_scales_, |
| &all_offsets_); |
| |
| // Extra weight shapes |
| std::unordered_map<std::string, ShapeInfo> weight_shape_info; |
| for (const auto i : c10::irange(weight_names.size())) { |
| TensorShape shape; |
| const auto& shape0 = weight_shapes[i]; |
| for (const auto d : shape0) { |
| shape.add_dims(d); |
| } |
| weight_shape_info[weight_names[i]] = ShapeInfo( |
| std::vector<TensorBoundShape::DimType>( |
| shape0.size(), TensorBoundShape_DimType_CONSTANT), |
| std::move(shape)); |
| } |
| |
| Blob* deferred_blob_reader = nullptr;
| if (ws->HasBlob("__DEFERRED_BLOB_READER__")) {
| deferred_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__");
| } |
| onnxGraph graph{nullptr}; |
| |
| static const uint64_t auxPropertiesListAOT[] = { |
| ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE}; |
| auto ret = lib_->onnxInitGraph( |
| backend, |
| use_glow_aot_ ? auxPropertiesListAOT : nullptr, |
| onnx_model_str.size(), |
| (const void*)(onnx_model_str.c_str()), |
| weight_descs.size(), |
| weight_descs.data(), |
| &graph, |
| static_cast<uint32_t>(max_seq_size_), |
| deferred_blob_reader);
| if (ret != ONNXIFI_STATUS_SUCCESS) { |
| if (ret == ONNXIFI_STATUS_FATAL_ERROR) { |
| C10_THROW_ERROR( |
| OnnxfiBackendSystemError, "Fatal error during onnxInitGraph"); |
| } else { |
| CAFFE_THROW("onnxInitGraph failed"); |
| } |
| } |
| |
| return std::make_shared<onnx::BackendGraphInfo>( |
| backend_id, backend, graph, lib_, std::move(weight_shape_info)); |
| }; |
| backend_graph_shared_ptr_ = |
| backend_graph_map_ptr_->insert(op_id_string_, creator); |
| |
| backend_id_ = backend_graph_shared_ptr_->backend_id; |
| backend_ = backend_graph_shared_ptr_->backend; |
| graph_ = backend_graph_shared_ptr_->graph; |
| input_shape_info_ = backend_graph_shared_ptr_->weight_shape_info; |
| |
| getExtFunctionPointers(); |
| } |
| |
| /// Set up function pointers if onnxifi_ext is enabled.
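| /// Each pointer is left as nullptr when the backend does not expose the
| /// corresponding extension function. A sketch of how a resolved pointer is
| /// then consulted at run time (the actual call sites are in the .cc file):
| ///
| ///   int64_t current_batch_size = 0;
| ///   if (onnxGetCurrentBatchSizePointer_ != nullptr &&
| ///       (*onnxGetCurrentBatchSizePointer_)(&current_batch_size) ==
| ///           ONNXIFI_STATUS_SUCCESS) {
| ///     // adjust output shapes for current_batch_size
| ///   }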
| void getExtFunctionPointers() { |
| #ifdef ONNXIFI_ENABLE_EXT |
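| // onnxGetExtensionFunctionAddress returns a generic
| // onnxExtensionFunctionPointer; the union lets us store that generic value
| // and read it back as the specific typed function pointer without an
| // explicit cast.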
| union { |
| onnxExtensionFunctionPointer p; |
| decltype(onnxSetIOAndRunGraphPointer_) set; |
| decltype(onnxReleaseTraceEventsPointer_) release; |
| decltype(onnxWaitEventForPointer_) waitfor; |
| decltype(onnxGetCurrentBatchSizePointer_) currentbatchsize; |
| } u; |
| if (lib_->onnxGetExtensionFunctionAddress( |
| backend_id_, "onnxSetIOAndRunGraphFunction", &u.p) != |
| ONNXIFI_STATUS_SUCCESS) { |
| onnxSetIOAndRunGraphPointer_ = nullptr; |
| } else { |
| onnxSetIOAndRunGraphPointer_ = u.set; |
| } |
| if (lib_->onnxGetExtensionFunctionAddress( |
| backend_id_, "onnxReleaseTraceEventsFunction", &u.p) != |
| ONNXIFI_STATUS_SUCCESS) { |
| onnxReleaseTraceEventsPointer_ = nullptr; |
| } else { |
| onnxReleaseTraceEventsPointer_ = u.release; |
| } |
| if (lib_->onnxGetExtensionFunctionAddress( |
| backend_id_, "onnxWaitEventForFunction", &u.p) != |
| ONNXIFI_STATUS_SUCCESS) { |
| onnxWaitEventForPointer_ = nullptr; |
| } else { |
| onnxWaitEventForPointer_ = u.waitfor; |
| } |
| if (lib_->onnxGetExtensionFunctionAddress( |
| backend_id_, "onnxGetCurrentBatchSizeFunction", &u.p) != |
| ONNXIFI_STATUS_SUCCESS) { |
| onnxGetCurrentBatchSizePointer_ = nullptr;
| } else { |
| onnxGetCurrentBatchSizePointer_ = u.currentbatchsize; |
| } |
| #endif |
| } |
| |
| /// Helper method for extractOutputBatchSizes(), used to deduplicate the code
| /// that populates output reshape info.
| template <typename DimContainer> |
| void fillOutputReshapeInfo( |
| const DimContainer& real_shape, |
| c10::ArrayRef<uint64_t> max_shape, |
| details::OutputReshapeInfo &output_reshape_info, |
| int index); |
| |
| /// Helper method for updating output reshape info using provided output shape hints. |
| void extractOutputBatchSizes(int current_batch_size); |
| |
| /// Extract the current output batch size and return it. If it equals
| /// max_batch_size_, no output shape adjustment is needed.
| int extractOutputBatchSizes(); |
| |
| /// Adjust output tensor shapes based on the current input batch size.
| /// If the output shape is conditioned on the first dim (batch size), we have
| /// a fast path that shrinks the tensor shape by just manipulating the
| /// metadata. Otherwise, we have to slice it in the middle of a dimension,
| /// which requires a copy. This is a slow path and we don't expect it to
| /// happen very often. We can omit this step altogether by setting
| /// adjust_output_batch_ to false.
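| /// For instance, with max_batch_size_ == 32 and a current batch of 8, an
| /// output allocated as [32, 100] only needs its first dim shrunk to [8, 100]
| /// (the metadata-only fast path), while an output whose batch dimension is
| /// not the first one has to be sliced out of the middle of the buffer with a
| /// copy (the slow path).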
| void adjustOutputBatchSizes(int current_batch_size); |
| |
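| /// Builds one onnxTensorDescriptorV1 per listed initializer (weight) by
| /// looking the blob up in the workspace. Weight names and shapes are
| /// recorded as well, and for quantized weights the per-group scales and
| /// offsets are stashed in all_scales/all_offsets, member storage that the
| /// descriptors can point into.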
| std::vector<onnxTensorDescriptorV1> buildInitializationList( |
| Workspace* ws, |
| const std::vector<std::string>& initializers, |
| std::vector<std::string>* weight_names, |
| std::vector<std::vector<uint64_t>>* weight_shapes, |
| std::vector<std::vector<float>>* all_scales, |
| std::vector<std::vector<int32_t>>* all_offsets) const; |
| |
| /// Initialize an OutputReshapeInfo object.
| details::OutputReshapeInfo initOutputReshapeInfo() const; |
| |
| // pointer to loaded onnxifi library |
| onnxifi_library* lib_{nullptr}; |
| onnx::OnnxBackendGraphMap* backend_graph_map_ptr_; |
| std::string op_id_string_; |
| |
| onnxBackendID backend_id_{nullptr}; |
| onnxBackend backend_{nullptr}; |
| onnxGraph graph_{nullptr}; |
| onnx::SharedPtrBackendGraphInfo backend_graph_shared_ptr_; |
| |
| // input/output descriptors |
| std::vector<onnxTensorDescriptorV1> input_desc_; |
| std::vector<onnxTensorDescriptorV1> output_desc_; |
| |
| // Output reshape info, keyed on batch size, with the value being the
| // OutputReshapeInfo for that batch size.
| std::unordered_map<int, details::OutputReshapeInfo> output_reshape_info_; |
| |
| #ifdef ONNXIFI_ENABLE_EXT |
| // onnxifi extension mode function pointer |
| onnxStatus (*onnxSetIOAndRunGraphPointer_)( |
| onnxGraph, |
| uint32_t, |
| const onnxTensorDescriptorV1*, |
| uint32_t, |
| const onnxTensorDescriptorV1*, |
| onnxMemoryFenceV1*, |
| onnxTraceEventList*); |
| |
| onnxStatus (*onnxReleaseTraceEventsPointer_)(onnxTraceEventList*); |
| onnxStatus (*onnxWaitEventForPointer_)( |
| onnxEvent event, |
| uint32_t timeoutMs, |
| onnxEventState* eventState, |
| onnxStatus* eventStatus, |
| char* message, |
| size_t* messageLength); |
| |
| onnxStatus (*onnxGetCurrentBatchSizePointer_)(int64_t*); |
| |
| std::shared_ptr<onnxTraceEventList> traces_{nullptr}; |
| #endif |
| |
| // ONNX model or not |
| bool use_onnx_{false}; |
| |
| // Glow AOT model or not |
| bool use_glow_aot_{false}; |
| |
| // max batch size |
| int max_batch_size_; |
| |
| // max sequence lookup size |
| int max_seq_size_; |
| |
| // Inference timeout limit. Default 0 means no timeout.
| int timeout_; |
| |
| // index of the input whose first dimension represents the batch size |
| int nominal_batch_idx_{0}; |
| |
| // We bind the op inputs/outputs by position while ONNXIFI binds them by
| // name. In addition, op input/output names can be rewritten by, for
| // example, memonger. We cache the original ONNX input/output names here and
| // bind them by position.
| std::vector<std::string> input_names_; |
| std::vector<std::string> output_names_; |
| |
| // NetDef of the onnxifi subgraph for shape inference |
| NetDef netdef_; |
| |
| std::vector<c10::SmallVector<uint64_t, 4>> input_shapes_; |
| std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_max_bs_; |
| |
| // Mapping of batch sizes to output shapes |
| std::unordered_map<int, std::vector<c10::SmallVector<uint64_t, 4>>> output_shapes_per_bs_; |
| |
| // Indicates whether the i-th output is a quantized tensor
| std::vector<bool> quantized_outputs_; |
| |
| // Multi-group quantization info (per-group scales and offsets)
| std::vector<std::vector<float>> all_scales_; |
| std::vector<std::vector<int32_t>> all_offsets_; |
| |
| // output shape hints |
| std::unordered_map<int, details::TensorInfo> output_shape_hints_; |
| |
| // Input shape info. Used by shape inference when inputs are not at
| // max_batch_size
| std::unordered_map<std::string, ShapeInfo> input_shape_info_; |
| |
| // Whether we should use passed output shape hints or do shape inference |
| const bool use_passed_output_shapes_{false}; |
| |
| // Whether we need to resize outputs or not |
| bool adjust_output_batch_{false}; |
| |
| // Whether we enable tracing in one run of inference |
| bool enable_tracing_{false}; |
| |
| // Adjust the quantized offset to compensate for offset mismatch on certain
| // backends
| uint8_t adjust_quantized_offset_{0}; |
| |
| // Whether we should read batch size value from Onnxifi request data |
| const bool use_onnxifi_batch_size_{false}; |
| }; |
| |
| } // namespace caffe2 |