| #include "concat_dnnlowp_op.h" |
| |
| #ifdef _OPENMP |
| #include <omp.h> |
| #endif |
| |
| #include "dnnlowp_partition.h" |
| |
| namespace caffe2 { |
| |
| using namespace std; |
| |
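// The concat axis comes either from the explicit "axis"/"add_axis" arguments
// or from the storage order ("NCHW" -> axis 1, "NHWC" -> axis 3). One
// requantization parameter set is kept per input.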
| template <typename T> |
| ConcatDNNLowPOp<T>::ConcatDNNLowPOp( |
| const OperatorDef& operator_def, |
| Workspace* ws) |
| : BaseType(operator_def, ws) { |
| if (HasArgument("axis")) { |
| axis_ = this->template GetSingleArgument<int>("axis", -1); |
| add_axis_ = this->template GetSingleArgument<int>("add_axis", 0); |
| } else { |
| axis_ = GetDimFromOrderString( |
| this->template GetSingleArgument<string>("order", "NCHW")); |
| add_axis_ = 0; |
| } |
| CAFFE_ENFORCE_GE(axis_, 0); |
| requantization_params_.resize(InputSize()); |
| } |
| |
| template <typename T> |
| bool ConcatDNNLowPOp<T>::RunOnDevice() { |
| GetQuantizationParameters_(); |
| |
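  // The first output is the concatenated tensor; the optional second output
  // records each input's size along the concat axis (the split information).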
| auto* output = OutputTensorCPU_(0); |
| Tensor* split = nullptr; |
| int* axis_data = nullptr; |
| if (OutputSize() >= 2) { |
| split = this->template Output<Tensor>(1, CPU); |
| split->Resize(vector<int64_t>(1, InputSize())); |
| axis_data = split->template mutable_data<int>(); |
| } |
| auto& input_zero = InputTensorCPU_(0); |
| CAFFE_ENFORCE_LT( |
| axis_, |
| input_zero.ndim() + (add_axis_ ? 1 : 0), |
| "Axis not in input ndim range."); |
| for (int i = 1; i < InputSize(); ++i) { |
| CAFFE_ENFORCE( |
| InputTensorCPU_(i).dtype() == input_zero.dtype(), |
| "All inputs must have the same type, expected: ", |
| input_zero.dtype().name(), |
| " but got: ", |
| InputTensorCPU_(i).dtype().name(), |
| " for input: ", |
| i); |
| } |
| |
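  // Flatten each input to (before, axis_dim, after): "before" is the product
  // of the dimensions preceding the concat axis and "after" the product of
  // the dimensions following it.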
| int before = 1, after = 1; |
| vector<int64_t> output_dims(input_zero.sizes().vec()); |
| for (int i = 0; i < input_zero.ndim(); ++i) { |
| if (i == axis_ && !add_axis_) { |
| continue; |
| } |
| int dim = input_zero.dim32(i); |
| if (i < axis_) { |
| before *= dim; |
    } else { // i > axis_ || (i == axis_ && add_axis_)
| after *= dim; |
| } |
    // Check that every input matches input 0 on all non-concat dimensions.
| for (int j = 1; j < InputSize(); ++j) { |
| int dim_j = InputTensorCPU_(j).dim32(i); |
| CAFFE_ENFORCE( |
| dim == dim_j, |
| "Expect dimension = ", |
| dim, |
| " got ", |
| dim_j, |
| " at axis = ", |
| i, |
| " for input: ", |
| j, |
| ". The input tensors can only have different dimensions " |
| "when arg 'add_axis' = 0 and along the axis = ", |
| axis_, |
| " <", |
| InputTensorCPU_(0).sizes(), |
| "> vs <", |
| InputTensorCPU_(j).sizes(), |
| ">."); |
| } |
| } |
| |
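  // The output size along the concat axis is the sum of the inputs' sizes
  // along that axis (or the number of inputs when add_axis is set).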
| int output_channels = 0; |
| for (int i = 0; i < InputSize(); ++i) { |
| auto dim = add_axis_ ? 1 : InputTensorCPU_(i).dim32(axis_); |
| if (axis_data) { |
| axis_data[i] = dim; |
| } |
| output_channels += dim; |
| } |
| if (add_axis_) { |
| output_dims.insert(output_dims.begin() + axis_, output_channels); |
| } else { |
| output_dims[axis_] = output_channels; |
| } |
| output->Resize(output_dims); |
| size_t output_offset = 0; |
| |
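  // Treat the quantized output as raw bytes so each input's column block can
  // be addressed with byte offsets.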
| char* output_data = reinterpret_cast<char*>(GetQuantizedOutputData_()); |
| |
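  // Bring each input into the output quantization domain, then copy it into
  // its column block of the output.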
| for (int i = 0; i < InputSize(); ++i) { |
| auto& input = InputTensorCPU_(i); |
| auto axis_dim = add_axis_ ? 1 : input.dim32(axis_); |
| |
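    // Requantize (or quantize) this input into a temporary buffer in
    // parallel; each thread then copies its own portion into the output.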
| vector<T> input_temp(input.numel()); |
| #ifdef _OPENMP |
| #pragma omp parallel |
| #endif |
| { |
| int nthreads = dnnlowp_get_num_threads(); |
| int tid = dnnlowp_get_thread_num(); |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int before_begin, before_end; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int after_begin, after_end; |
| |
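      // Statically partition the (before) x (axis_dim * after) matrix into
      // contiguous per-thread chunks: either a range of whole rows or a
      // segment of a single row.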
| Get1DPartitionOf2D( |
| before, |
| axis_dim * after, |
| nthreads, |
| tid, |
| &before_begin, |
| &before_end, |
| &after_begin, |
| &after_end); |
| |
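      // Flat element range [j_begin, j_end) owned by this thread; it is
      // contiguous because the partition above is contiguous in row-major
      // order.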
| int j_begin = before_begin * axis_dim * after + after_begin; |
| int j_end = (before_end - 1) * axis_dim * after + after_end; |
| |
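      // An already-quantized input is requantized from in_qparams_[i] to
      // out_qparams_; a float input is quantized directly with out_qparams_.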
| if (InputTensorCPU_(i).template IsType<T>()) { |
| const T* input_data = input.template data<T>(); |
| for (int j = j_begin; j < j_end; ++j) { |
| input_temp[j] = fbgemm::Requantize<T>( |
| input_data[j] - in_qparams_[i].zero_point, |
| requantization_params_[i]); |
| } |
| } else { |
| fbgemm::Quantize<T>( |
| input.template data<float>() + j_begin, |
| input_temp.data() + j_begin, |
| j_end - j_begin, |
| out_qparams_); |
| } |
| |
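      // Copy this thread's block into the output. The leading dimensions are
      // in elements; offsets into the char* output pointer are in bytes.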
| math::CopyMatrix<CPUContext>( |
| sizeof(T), |
| before_end - before_begin, |
| after_end - after_begin, |
| input_temp.data() + before_begin * axis_dim * after + after_begin, |
| axis_dim * after, |
          output_data + output_offset +
              (before_begin * output_channels * after + after_begin) *
                  sizeof(T),
| output_channels * after, |
| &context_, |
| input_zero.dtype().copy()); |
| } |
| |
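    // Advance the byte offset to the next input's column block.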
| output_offset += axis_dim * after * sizeof(T); |
| } |
| |
| RunOnDeviceEpilogue_(); |
| |
| return true; |
| } |
| |
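// Reads the quantization parameters of every input, chooses the output
// quantization parameters, and precomputes one requantization multiplier
// per input from the scale ratio in_scale / out_scale.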
| template <typename T> |
| void ConcatDNNLowPOp<T>::GetQuantizationParameters_() { |
| using namespace dnnlowp; |
| for (int i = 0; i < InputSize(); ++i) { |
| in_qparams_[i] = |
| GetInputTensorQuantizationParamsOf(this, i, qfactory_.get()); |
| } |
| |
| GetOutputQuantizationParams_(); |
| |
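  // Each input is rescaled by in_scale / out_scale so its values are
  // expressed in the output's quantization domain.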
| for (int i = 0; i < InputSize(); ++i) { |
| float real_multiplier = in_qparams_[i].scale / out_qparams_.scale; |
| requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier( |
| real_multiplier, out_qparams_); |
| } |
| } |
| |
| REGISTER_CPU_OPERATOR_WITH_ENGINE(Concat, DNNLOWP, ConcatDNNLowPOp<uint8_t>); |
| REGISTER_CPU_OPERATOR_WITH_ENGINE( |
| Int8Concat, |
| DNNLOWP, |
| ConcatDNNLowPOp<uint8_t>); |
| |
| } // namespace caffe2 |