#include "caffe2/opt/onnxifi_op.h"
#include "caffe2/operators/slice_op.h"
#include "caffe2/opt/bound_shape_inferencer.h"
#include <c10/util/irange.h>
namespace caffe2 {
namespace {
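// Maps the dtype of a CPU tensor to the matching ONNXIFI data type and points
// the descriptor's buffer at the tensor's underlying storage. Throws on
// unsupported element types.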
void setInputTensorDescriptorTypeAndBuffer(
const Tensor& cpu_tensor,
onnxTensorDescriptorV1* desc) {
if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else if (cpu_tensor.template IsType<c10::Half>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<c10::Half>());
} else if (cpu_tensor.template IsType<float>()) {
desc->dataType = ONNXIFI_DATATYPE_FLOAT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<float>());
} else if (cpu_tensor.template IsType<int8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
} else if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int64_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT64;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int64_t>());
} else if (cpu_tensor.template IsType<int16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int16_t>());
} else if (cpu_tensor.template IsType<uint16_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT16;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint16_t>());
} else {
CAFFE_THROW(
"Unsupported tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
}
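// Overload for Int8TensorCPU: besides the data type and buffer, also fills in
// the single per-tensor quantization scale and zero point.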
void setInputTensorDescriptorTypeAndBuffer(
const int8::Int8TensorCPU& cpu_int8tensor,
onnxTensorDescriptorV1* desc) {
const Tensor& cpu_tensor = cpu_int8tensor.t;
if (cpu_tensor.template IsType<uint8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_UINT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<uint8_t>());
} else if (cpu_tensor.template IsType<int8_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT8;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int8_t>());
} else if (cpu_tensor.template IsType<int32_t>()) {
desc->dataType = ONNXIFI_DATATYPE_INT32;
desc->buffer = reinterpret_cast<onnxPointer>(cpu_tensor.data<int32_t>());
} else {
CAFFE_THROW(
"Unsupported Int8Tensor type in ONNXIFI: ", cpu_tensor.dtype().name());
}
desc->quantizationParams = 1;
desc->quantizationAxis = 1;
desc->scales = &cpu_int8tensor.scale;
desc->biases = &cpu_int8tensor.zero_point;
}
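// Subtracts a constant offset from every element of a quantized tensor. Used
// by RunOnDevice to shift uint8 outputs when adjust_quantized_offset_ is set.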
template <typename T>
void adjustQuantizedOffsetImpl(Tensor* t, uint8_t offset) {
auto* data = t->mutable_data<T>();
for (auto i: c10::irange(t->numel())) {
data[i] -= offset;
}
}
void adjustQuantizedOffset(Tensor* t, uint8_t offset) {
if (t->template IsType<uint8_t>()) {
adjustQuantizedOffsetImpl<uint8_t>(t, offset);
}
}
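// Translates an ONNXIFI data type enum value into the corresponding Caffe2
// TypeMeta; enforces that the type is supported.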
TypeMeta OnnxifiTypeToDataType(uint64_t onnxifi_type) {
static std::map<uint64_t, TypeMeta> data_type_map{
{ONNXIFI_DATATYPE_FLOAT32, TypeMeta::Make<float>()},
{ONNXIFI_DATATYPE_FLOAT16, TypeMeta::Make<c10::Half>()},
{ONNXIFI_DATATYPE_INT32, TypeMeta::Make<int>()},
{ONNXIFI_DATATYPE_INT8, TypeMeta::Make<int8_t>()},
{ONNXIFI_DATATYPE_UINT8, TypeMeta::Make<uint8_t>()},
{ONNXIFI_DATATYPE_INT64, TypeMeta::Make<int64_t>()},
{ONNXIFI_DATATYPE_INT16, TypeMeta::Make<int16_t>()},
{ONNXIFI_DATATYPE_UINT16, TypeMeta::Make<uint16_t>()},
};
const auto it = data_type_map.find(onnxifi_type);
CAFFE_ENFORCE(
it != data_type_map.end(),
"Unsupported ONNXIFI data type: ",
onnxifi_type);
return it->second;
}
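// Prepares an output descriptor: records the ONNXIFI data type and points the
// buffer at the output tensor's storage, allocating it with the matching
// Caffe2 type if necessary.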
void setOutputTensorDescriptorTypeAndBuffer(
uint64_t onnxifi_type,
Tensor* cpu_tensor,
onnxTensorDescriptorV1* desc) {
desc->dataType = onnxifi_type;
desc->buffer = reinterpret_cast<onnxPointer>(
cpu_tensor->raw_mutable_data(OnnxifiTypeToDataType(onnxifi_type)));
}
#ifndef C10_MOBILE
void copyDescriptor(
const ExternalTensorDescriptor* from,
onnxTensorDescriptorV1* to) {
to->dataType = from->dataType;
to->buffer = from->buffer;
to->isOffline = from->isOffline;
to->quantizationParams = from->quantizationParams;
to->quantizationAxis = from->quantizationAxis;
to->scales = from->scales;
to->biases = from->biases;
to->dimensions = from->dimensions;
to->shape = from->shape;
}
#endif
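// Fills an onnxTensorDescriptorV1 from a weight blob in the workspace,
// handling plain CPU tensors, Int8TensorCPU, and externally registered tensor
// types. Shapes (and, for quantized blobs, scales/offsets) are appended to the
// caller-owned vectors so the descriptor's pointers stay valid.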
void BlobToTensorDescriptor(
const std::string& name,
Workspace* ws,
onnxTensorDescriptorV1* desc,
std::vector<std::vector<uint64_t>>* shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) {
const Blob* blob = ws->GetBlob(name);
CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist");
const bool is_int8tensor =
blob->meta().id() == TypeMeta::Id<int8::Int8TensorCPU>();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool is_external_tensor;
#ifndef C10_MOBILE
auto function_ptr =
ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id());
is_external_tensor = function_ptr != nullptr;
#else
is_external_tensor = false;
#endif
// Memory type
// We only allow weights to be CPU tensor or int8tensor for now
CAFFE_ENFORCE(
(BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob) ||
is_external_tensor),
"Initialization blob ",
name,
" needs to be TensorCPU or Int8TensorCPU or Int8FCDNNLowPPackedWeightBlob Based class: ",
blob->TypeName());
desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU;
desc->isOffline = false;
if (is_int8tensor) {
// Data type
const auto& cpu_int8tensor = blob->template Get<int8::Int8TensorCPU>();
const auto& cpu_tensor = cpu_int8tensor.t;
setInputTensorDescriptorTypeAndBuffer(cpu_int8tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
} else if (is_external_tensor) {
#ifndef C10_MOBILE
ExternalTensorDescriptor ext_desc;
function_ptr->SetupExternalTensorDescriptor(
blob, shapes, all_scales, all_offsets, &ext_desc);
copyDescriptor(&ext_desc, desc);
#endif
} else {
// Data type
const auto& cpu_tensor = blob->template Get<TensorCPU>();
setInputTensorDescriptorTypeAndBuffer(cpu_tensor, desc);
// Set dims
const auto shape = cpu_tensor.sizes();
desc->dimensions = shape.size();
shapes->emplace_back(shape.cbegin(), shape.cend());
desc->shape = shapes->back().data();
desc->quantizationParams = 0;
}
}
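// Maps a Caffe2 TensorProto data type to the ONNXIFI data type enum, returning
// ONNXIFI_DATATYPE_UNDEFINED (with a warning) for unsupported types.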
uint64_t getOnnxifiDataType(caffe2::TensorProto::DataType t) {
#define CAFFE2_TO_ONNXIFI_TYPE(x) \
case (caffe2::TensorProto::x): \
return ONNXIFI_DATATYPE_##x
switch (t) {
CAFFE2_TO_ONNXIFI_TYPE(INT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT8);
CAFFE2_TO_ONNXIFI_TYPE(UINT16);
CAFFE2_TO_ONNXIFI_TYPE(INT16);
CAFFE2_TO_ONNXIFI_TYPE(INT32);
CAFFE2_TO_ONNXIFI_TYPE(INT64);
CAFFE2_TO_ONNXIFI_TYPE(FLOAT16);
case (caffe2::TensorProto::FLOAT):
return ONNXIFI_DATATYPE_FLOAT32;
default:
LOG(WARNING) << "Unsupported Caffe2 tensor type: " << t;
return ONNXIFI_DATATYPE_UNDEFINED;
}
#undef CAFFE2_TO_ONNXIFI_TYPE
}
} // namespace
namespace details {
TensorInfo::TensorInfo(const TensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(false),
quantizationAxis(0),
quantizationParams(0) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
}
TensorInfo::TensorInfo(const QTensorProto& t)
: onnxifi_type(getOnnxifiDataType(t.data_type())),
quantized(true),
quantizationAxis(t.has_axis() ? t.axis() : 0),
quantizationParams(t.scales_size() ? t.scales_size() : 1) {
for (const auto d : t.dims()) {
dims.push_back(d);
}
if (t.scales_size()) {
for (const auto d : t.scales()) {
scales.push_back(static_cast<float>(d));
}
for (const auto d : t.biases()) {
biases.push_back(static_cast<int32_t>(d));
}
} else {
scales.push_back(static_cast<float>(t.scale()));
biases.push_back(static_cast<int32_t>(t.bias()));
}
}
} // namespace details
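// Builds tensor descriptors for every workspace blob named in `initializers`,
// recording the weight names and shapes in caller-owned vectors so that the
// descriptors' name/shape pointers remain valid. Enforces that every requested
// initializer was found in the workspace.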
template <>
std::vector<onnxTensorDescriptorV1>
OnnxifiOp<CPUContext>::buildInitializationList(
Workspace* ws,
const std::vector<std::string>& initializers,
std::vector<std::string>* weight_names,
std::vector<std::vector<uint64_t>>* weight_shapes,
std::vector<std::vector<float>>* all_scales,
std::vector<std::vector<int32_t>>* all_offsets) const {
std::unordered_set<std::string> initialization_list(
initializers.begin(), initializers.end());
const std::vector<string>& ws_blobs = ws->Blobs();
// Since onnxTensorDescriptorV1.name will point into the memory in
// weight_names, we need to prevent weight_names from reallocating by
// reserving enough memory ahead of time
weight_names->reserve(ws_blobs.size());
std::vector<onnxTensorDescriptorV1> descs;
for (const auto& s : ws_blobs) {
auto it = initialization_list.find(s);
if (it != initialization_list.end()) {
weight_names->emplace_back(s);
onnxTensorDescriptorV1 tensor_desc;
tensor_desc.name = weight_names->back().c_str();
BlobToTensorDescriptor(
s, ws, &tensor_desc, weight_shapes, all_scales, all_offsets);
descs.push_back(tensor_desc);
initialization_list.erase(it);
}
}
CAFFE_ENFORCE(initialization_list.empty(), "Unfulfilled initialization list");
return descs;
}
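// Creates per-output begin/end tensors (sized from the output shape hints)
// that are later used to slice outputs back down to the real batch size.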
template <>
details::OutputReshapeInfo OnnxifiOp<CPUContext>::initOutputReshapeInfo()
const {
details::OutputReshapeInfo output_reshape_info;
output_reshape_info.begins.reserve(output_names_.size());
output_reshape_info.ends.reserve(output_names_.size());
output_reshape_info.fast_path.reserve(output_names_.size());
for (auto i: c10::irange(output_names_.size())) {
const auto it = output_shape_hints_.find(i);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find output shape hints for ",
output_names_[i]);
int64_t num_dims = it->second.dims.size();
// Initialize the tensors used to slice the output
output_reshape_info.begins.emplace_back();
ReinitializeTensor(
&output_reshape_info.begins.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
output_reshape_info.ends.emplace_back();
ReinitializeTensor(
&output_reshape_info.ends.back(),
{num_dims},
at::dtype<int32_t>().device(CPU));
}
return output_reshape_info;
}
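// Computes the slice bounds that map an output produced at max batch size back
// to its real shape. The fast path is taken only when any size mismatch is
// confined to dim 0 (the batch dimension), in which case a simple ShrinkTo
// suffices instead of a generic Slice.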
template <>
template <typename DimContainer>
void OnnxifiOp<CPUContext>::fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo& output_reshape_info,
int currentIndex) {
CAFFE_ENFORCE_EQ(real_shape.size(), max_shape.size());
const auto dim_size = real_shape.size();
auto& begin = output_reshape_info.begins[currentIndex];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[currentIndex];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (auto j: c10::irange(dim_size)) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape[j],
"It is weird that max shape of ",
output_names_[currentIndex],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape[j],
")");
begin_ptr[j] = 0;
if (max_shape[j] > static_cast<uint64_t>(real_shape[j])) {
end_ptr[j] = real_shape[j];
mismatch += j;
} else {
end_ptr[j] = max_shape[j];
}
}
if (dim_size > 0) {
output_reshape_info.fast_path[currentIndex] = !mismatch;
} else {
output_reshape_info.fast_path[currentIndex] = false;
}
}
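// Populates output_reshape_info_ for the given real batch size, either from
// output shapes passed in by the caller or by running bound shape inference
// over the net with the current input shapes.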
template <>
void OnnxifiOp<CPUContext>::extractOutputBatchSizes(int current_batch_size) {
auto& output_reshape_info =
output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo())
.first->second;
if (use_passed_output_shapes_) {
const auto shape_info_it = output_shapes_per_bs_.find(current_batch_size);
CAFFE_ENFORCE(
shape_info_it != output_shapes_per_bs_.end(),
"Unable to find outputs shapes for bs=",
current_batch_size);
CAFFE_ENFORCE_EQ(shape_info_it->second.size(), OutputSize());
for (int i = 0; i < OutputSize(); ++i) {
fillOutputReshapeInfo(
shape_info_it->second[i],
output_shapes_max_bs_[i],
output_reshape_info,
i);
}
} else {
BoundShapeSpec spec(current_batch_size, max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto find_res = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(find_res != shape_info.end());
fillOutputReshapeInfo(
find_res->second.shape.dims(),
output_shapes_max_bs_[i],
output_reshape_info,
i);
}
}
}
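// Determines the real batch size from the nominal batch input. Returns
// max_batch_size_ when no adjustment is needed; otherwise ensures that
// output_reshape_info_ has an entry for the real batch size and returns it.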
template <>
int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
if (use_onnx_ || !adjust_output_batch_) {
return max_batch_size_;
}
// Get the real batch size from nominal input. If it's equal to
// max_batch_size, mark that we don't need to adjust batch size and return.
// Otherwise, do a pass of shape inference to get the real shapes of the
// outputs.
const Tensor* t = nullptr;
if (this->template InputIsType<int8::Int8TensorCPU>(nominal_batch_idx_)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(nominal_batch_idx_);
t = &input_tensor_int8.t;
} else {
t = &Input(nominal_batch_idx_);
}
CAFFE_ENFORCE(
t, "Null input shape tensor ptr. Possibly unsupported tensor type");
CAFFE_ENFORCE(
!t->sizes().empty(),
input_names_[nominal_batch_idx_],
" cannot be empty");
const auto dims = t->sizes();
const int current_batch_size = dims[0];
if (current_batch_size == max_batch_size_) {
return max_batch_size_;
}
// We still need to adjust output size but we can skip the shape inference as
// it was done before.
if (output_reshape_info_.count(current_batch_size)) {
return current_batch_size;
}
extractOutputBatchSizes(current_batch_size);
return current_batch_size;
}
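// Shrinks or slices each output tensor from max batch size down to the real
// batch size using the precomputed begin/end bounds.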
template <>
void OnnxifiOp<CPUContext>::adjustOutputBatchSizes(int current_batch_size) {
auto it = output_reshape_info_.find(current_batch_size);
CAFFE_ENFORCE(
it != output_reshape_info_.end(),
"Cannot find current_batch_size ",
current_batch_size,
" in output_reshape_info_");
const auto& output_reshape_info = it->second;
CPUContext context;
Tensor tmp(CPU);
for (int i = 0; i < OutputSize(); ++i) {
Tensor* output_tensor = quantized_outputs_[i]
? (&this->template Output<int8::Int8TensorCPU>(i)->t)
: Output(i);
const auto& end = output_reshape_info.ends[i];
if (output_reshape_info.fast_path[i]) {
output_tensor->ShrinkTo(end.data<int32_t>()[0]);
} else {
// We need to use generic Slice
SliceImpl<int32_t, CPUContext>(
&tmp, *output_tensor, output_reshape_info.begins[i], end, &context);
output_tensor->CopyFrom(tmp);
}
}
}
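// Sets up the descriptor and allocates the Caffe2 output tensor for one
// output, using the shape hints at max batch size. Quantized outputs with a
// single quantization parameter are emitted as Int8TensorCPU.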
template <>
void OnnxifiOp<CPUContext>::setOutputShapeAndType(
int output_idx,
c10::SmallVector<int64_t, 4>& tensor_dims_int64) {
tensor_dims_int64.clear();
std::vector<size_t> tensor_dims;
uint64_t type = ONNXIFI_DATATYPE_FLOAT32;
const auto it = output_shape_hints_.find(output_idx);
CAFFE_ENFORCE(
it != output_shape_hints_.end(),
"Cannot find shape hint for output: ",
output_names_[output_idx]);
const auto& info = it->second;
std::copy(
info.dims.begin(), info.dims.end(), std::back_inserter(tensor_dims));
type = it->second.onnxifi_type;
auto& tensor_descriptor = output_desc_[output_idx];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
tensor_descriptor.dimensions = tensor_dims.size();
CAFFE_ENFORCE(
tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim");
auto& output_shape = output_shapes_max_bs_[output_idx];
output_shape.clear();
output_shape.insert(
output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.shape = output_shape.data();
std::copy(
tensor_dims.cbegin(),
tensor_dims.cend(),
std::back_inserter(tensor_dims_int64));
// Setup the output C2 tensor
if (!info.quantized) {
// Normal Tensor
auto* output_tensor = Output(
output_idx,
tensor_dims_int64,
at::dtype(OnnxifiTypeToDataType(type)).device(CPU));
setOutputTensorDescriptorTypeAndBuffer(
type, output_tensor, &tensor_descriptor);
} else if (info.quantizationParams == 1) {
// single quantizer, output Int8Tensor
auto* output_tensor =
this->template Output<int8::Int8TensorCPU>(output_idx);
output_tensor->t.Resize(tensor_dims_int64);
setOutputTensorDescriptorTypeAndBuffer(
type, &output_tensor->t, &tensor_descriptor);
tensor_descriptor.quantizationParams = 1;
tensor_descriptor.quantizationAxis = 1;
tensor_descriptor.scales = &output_tensor->scale;
tensor_descriptor.biases = &output_tensor->zero_point;
} else {
CAFFE_THROW(
"OnnxifiOp does not support output tensor with multi-quantization params: ",
output_names_[output_idx]);
}
}
string mapOnnxStateToString(onnxEventState state) {
switch (state) {
case ONNXIFI_EVENT_STATE_NONSIGNALLED:
return "ONNXIFI_EVENT_STATE_NONSIGNALLED";
default:
return "ONNXIFI_EVENT_STATE_STRING_NOT_MAPPED";
}
}
string mapOnnxStatusToString(onnxStatus status) {
switch (status) {
case ONNXIFI_STATUS_SUCCESS:
return "ONNXIFI_STATUS_SUCCESS";
case ONNXIFI_STATUS_FALLBACK:
return "ONNXIFI_STATUS_FALLBACK";
case ONNXIFI_STATUS_INVALID_ID:
return "ONNXIFI_STATUS_INVALID_ID";
case ONNXIFI_STATUS_INVALID_SIZE:
return "ONNXIFI_STATUS_INVALID_SIZE";
case ONNXIFI_STATUS_INVALID_POINTER:
return "ONNXIFI_STATUS_INVALID_POINTER";
case ONNXIFI_STATUS_INVALID_PROTOBUF:
return "ONNXIFI_STATUS_INVALID_PROTOBUF";
case ONNXIFI_STATUS_INVALID_MODEL:
return "ONNXIFI_STATUS_INVALID_MODEL";
case ONNXIFI_STATUS_INVALID_BACKEND:
return "ONNXIFI_STATUS_INVALID_BACKEND";
case ONNXIFI_STATUS_INVALID_GRAPH:
return "ONNXIFI_STATUS_INVALID_GRAPH";
case ONNXIFI_STATUS_INVALID_EVENT:
return "ONNXIFI_STATUS_INVALID_EVENT";
case ONNXIFI_STATUS_INVALID_STATE:
return "ONNXIFI_STATUS_INVALID_STATE";
case ONNXIFI_STATUS_INVALID_NAME:
return "ONNXIFI_STATUS_INVALID_NAME";
case ONNXIFI_STATUS_INVALID_SHAPE:
return "ONNXIFI_STATUS_INVALID_SHAPE";
case ONNXIFI_STATUS_INVALID_DATATYPE:
return "ONNXIFI_STATUS_INVALID_DATATYPE";
case ONNXIFI_STATUS_INVALID_MEMORY_TYPE:
return "ONNXIFI_STATUS_INVALID_MEMORY_TYPE";
case ONNXIFI_STATUS_INVALID_MEMORY_LOCATION:
return "ONNXIFI_STATUS_INVALID_MEMORY_LOCATION";
case ONNXIFI_STATUS_INVALID_FENCE_TYPE:
return "ONNXIFI_STATUS_INVALID_FENCE_TYPE";
case ONNXIFI_STATUS_INVALID_PROPERTY:
return "ONNXIFI_STATUS_INVALID_PROPERTY";
case ONNXIFI_STATUS_UNSUPPORTED_TAG:
return "ONNXIFI_STATUS_UNSUPPORTED_TAG";
case ONNXIFI_STATUS_UNSUPPORTED_VERSION:
return "ONNXIFI_STATUS_UNSUPPORTED_VERSION";
case ONNXIFI_STATUS_UNSUPPORTED_OPERATOR:
return "ONNXIFI_STATUS_UNSUPPORTED_OPERATOR";
case ONNXIFI_STATUS_UNSUPPORTED_ATTRIBUTE:
return "ONNXIFI_STATUS_UNSUPPORTED_ATTRIBUTE";
case ONNXIFI_STATUS_UNSUPPORTED_SHAPE:
return "ONNXIFI_STATUS_UNSUPPORTED_SHAPE";
case ONNXIFI_STATUS_UNSUPPORTED_DATATYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_DATATYPE";
case ONNXIFI_STATUS_UNSUPPORTED_MEMORY_TYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_MEMORY_TYPE";
case ONNXIFI_STATUS_UNSUPPORTED_FENCE_TYPE:
return "ONNXIFI_STATUS_UNSUPPORTED_FENCE_TYPE";
case ONNXIFI_STATUS_UNSUPPORTED_PROPERTY:
return "ONNXIFI_STATUS_UNSUPPORTED_PROPERTY";
case ONNXIFI_STATUS_UNIDENTIFIED_NAME:
return "ONNXIFI_STATUS_UNIDENTIFIED_NAME";
case ONNXIFI_STATUS_MISMATCHING_SHAPE:
return "ONNXIFI_STATUS_MISMATCHING_SHAPE";
case ONNXIFI_STATUS_MISMATCHING_DATATYPE:
return "ONNXIFI_STATUS_MISMATCHING_DATATYPE";
case ONNXIFI_STATUS_NO_SYSTEM_MEMORY:
return "ONNXIFI_STATUS_NO_SYSTEM_MEMORY";
case ONNXIFI_STATUS_NO_DEVICE_MEMORY:
return "ONNXIFI_STATUS_NO_DEVICE_MEMORY";
case ONNXIFI_STATUS_NO_SYSTEM_RESOURCES:
return "ONNXIFI_STATUS_NO_SYSTEM_RESOURCES";
case ONNXIFI_STATUS_NO_DEVICE_RESOURCES:
return "ONNXIFI_STATUS_NO_DEVICE_RESOURCES";
case ONNXIFI_STATUS_BACKEND_UNAVAILABLE:
return "ONNXIFI_STATUS_BACKEND_UNAVAILABLE";
case ONNXIFI_STATUS_INTERNAL_ERROR:
return "ONNXIFI_STATUS_INTERNAL_ERROR";
case ONNXIFI_STATUS_FATAL_ERROR:
return "ONNXIFI_STATUS_FATAL_ERROR";
default:
return "ONNXIFI_STATUS_STRING_NOT_MAPPED";
}
}
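// Runs the lowered graph on the ONNXIFI backend: fills input/output
// descriptors, invokes either the extension path (onnxSetIOAndRunGraph) or the
// standard onnxSetGraphIO/onnxRunGraph path, then adjusts quantized offsets
// and output batch sizes as needed.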
template <>
bool OnnxifiOp<CPUContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
for (auto i: c10::irange(InputSize())) {
auto& tensor_descriptor = input_desc_[i];
tensor_descriptor.tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1;
tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU;
at::IntArrayRef tensor_dims;
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
this->template Input<int8::Int8TensorCPU>(i);
const auto& cpu_tensor = input_tensor_int8.t;
tensor_dims = cpu_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(
input_tensor_int8, &tensor_descriptor);
} else {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
const auto& input_tensor = Input(i);
tensor_dims = input_tensor.sizes();
setInputTensorDescriptorTypeAndBuffer(input_tensor, &tensor_descriptor);
}
auto& input_shape = input_shapes_[i];
input_shape.clear();
input_shape.insert(
input_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
tensor_descriptor.dimensions = tensor_dims.size();
tensor_descriptor.shape = input_shape.data();
}
CAFFE_ENFORCE_EQ(output_desc_.size(), OutputSize());
c10::SmallVector<int64_t, 4> tensor_dims_int64;
for (auto i: c10::irange(OutputSize())) {
setOutputShapeAndType(i, tensor_dims_int64);
}
bool ext_supported = false;
onnxMemoryFenceV1 input_fence;
onnxMemoryFenceV1 output_fence;
std::vector<int> output_batch_sizes;
int current_batch_size = max_batch_size_;
#ifdef ONNXIFI_ENABLE_EXT
/**
* If onnxifi extension mode is enabled and onnxSetIOAndRunGraph is
* supported by the backend, we run through this workflow; otherwise we
* fall back to the non-onnxifi-extension workflow.
**/
if (onnxSetIOAndRunGraphPointer_ != nullptr) {
ext_supported = true;
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
traces_.reset();
if (enable_tracing_) {
traces_ = std::shared_ptr<onnxTraceEventList>(
new onnxTraceEventList(), [this](onnxTraceEventList* p) {
if (p && onnxReleaseTraceEventsPointer_) {
CAFFE_ENFORCE_EQ(
(*onnxReleaseTraceEventsPointer_)(p), ONNXIFI_STATUS_SUCCESS);
}
delete p;
});
traces_->numEvents = 0;
}
const onnxStatus status = (*onnxSetIOAndRunGraphPointer_)(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data(),
&output_fence,
traces_.get());
CAFFE_ENFORCE_EQ(
status,
ONNXIFI_STATUS_SUCCESS,
"Reason: onnxSetIOAndRunGraph returned status code ",
mapOnnxStatusToString(status));
// Check if we should rely on Onnxifi to provide current batch size
if (use_onnxifi_batch_size_ && onnxGetCurrentBatchSizePointer_ != nullptr) {
int64_t onnxifiBatchSize;
if ((*onnxGetCurrentBatchSizePointer_)(&onnxifiBatchSize) == ONNXIFI_STATUS_SUCCESS) {
current_batch_size = onnxifiBatchSize;
if (current_batch_size != max_batch_size_ &&
output_reshape_info_.count(current_batch_size) == 0) {
extractOutputBatchSizes(current_batch_size);
}
} else {
current_batch_size = extractOutputBatchSizes();
}
} else {
current_batch_size = extractOutputBatchSizes();
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
onnxEventState eventState;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
onnxStatus eventStatus;
std::string message;
size_t messageLength = 512;
message.resize(messageLength);
CAFFE_ENFORCE_EQ(
(*onnxWaitEventForPointer_)(
output_fence.event,
timeout_,
&eventState,
&eventStatus,
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<char*>(message.data()),
&messageLength),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
eventState,
ONNXIFI_EVENT_STATE_SIGNALLED,
"Onnxifi run timeouted out after ",
timeout_,
" ms.",
"Reason: Onnxifi run returned event state code ",
mapOnnxStateToString(eventState));
if (eventStatus != ONNXIFI_STATUS_SUCCESS) {
if (messageLength == 0) {
CAFFE_THROW("onnxifi internal error");
} else {
CAFFE_THROW(message);
}
}
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
#endif
if (!ext_supported) {
CAFFE_ENFORCE_EQ(
lib_->onnxSetGraphIO(
graph_,
input_desc_.size(),
input_desc_.data(),
output_desc_.size(),
output_desc_.data()),
ONNXIFI_STATUS_SUCCESS);
input_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
input_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
CAFFE_ENFORCE_EQ(
lib_->onnxInitEvent(backend_, &input_fence.event),
ONNXIFI_STATUS_SUCCESS);
output_fence.tag = ONNXIFI_TAG_MEMORY_FENCE_V1;
output_fence.type = ONNXIFI_SYNCHRONIZATION_EVENT;
// Call the async run on backend, signal event on input fence and wait for
// the event on output fence
CAFFE_ENFORCE_EQ(
lib_->onnxRunGraph(graph_, &input_fence, &output_fence),
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
current_batch_size = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
// Destroy the event objects
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
if (adjust_quantized_offset_) {
for (auto i: c10::irange(OutputSize())) {
if (quantized_outputs_[i]) {
auto* int8_tensor = this->template Output<int8::Int8TensorCPU>(i);
int8_tensor->zero_point += adjust_quantized_offset_;
adjustQuantizedOffset(&int8_tensor->t, adjust_quantized_offset_);
}
}
}
if (adjust_output_batch_ && current_batch_size != max_batch_size_) {
adjustOutputBatchSizes(current_batch_size);
}
enable_tracing_ = false;
return true;
}
REGISTER_CPU_OPERATOR(Onnxifi, OnnxifiOp<CPUContext>);
OPERATOR_SCHEMA(Onnxifi)
.NumInputs(0, INT_MAX)
.NumOutputs(0, INT_MAX)
.SetDoc(R"DOC(
The Onnxifi operator is a black-box operator that lowers the computation to an Onnxifi backend
)DOC")
.Arg(
"onnx_model",
"(string default=\"\") Serialized ONNX model to be converted to backend representation")
.Arg(
"initializers",
"Initialization pair indicating the mapping of the name between NetDef and ONNX model")
.Arg(
"output_resize_hints",
"A list of key/value pairs indicating which input index to look up for real batch size for the given max output batch size");
} // namespace caffe2