| /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ |
| #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ |
| |
| #include <stdint.h> |
| #include <sys/types.h> |
| #include <algorithm> |
| #include <cmath> |
| #include <functional> |
| #include <limits> |
| #include <memory> |
| #include <type_traits> |
| |
| #include "fixedpoint/fixedpoint.h" |
| #include "public/gemmlowp.h" |
| #include "tensorflow/contrib/lite/kernels/internal/common.h" |
| #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" |
| #include "tensorflow/contrib/lite/kernels/internal/round.h" |
| #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h" |
| #include "tensorflow/contrib/lite/kernels/internal/types.h" |
| |
| namespace tflite { |
| |
| // TODO(b/77858996): Add these to gemmlowp. |
template <typename IntegerType>
IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
  // Primary template is deliberately unusable: only the explicit
  // specializations below may be instantiated.
  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
  return a;
}

// Adds two int32 values, clamping the mathematically exact sum to the
// representable int32 range instead of wrapping on overflow.
template <>
inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
  const std::int64_t exact_sum =
      static_cast<std::int64_t>(a) + static_cast<std::int64_t>(b);
  const std::int64_t kInt32Min = std::numeric_limits<std::int32_t>::min();
  const std::int64_t kInt32Max = std::numeric_limits<std::int32_t>::max();
  if (exact_sum > kInt32Max) {
    return std::numeric_limits<std::int32_t>::max();
  }
  if (exact_sum < kInt32Min) {
    return std::numeric_limits<std::int32_t>::min();
  }
  return static_cast<std::int32_t>(exact_sum);
}
| |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingAddNonGemmlowp(a.raw(), b.raw())); |
| } |
| |
template <typename IntegerType>
IntegerType SaturatingSub(IntegerType a, IntegerType b) {
  // Primary template is deliberately unusable: only the explicit
  // specializations below may be instantiated.
  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
  return a;
}

// Subtracts two int16 values, clamping the exact difference to the
// representable int16 range instead of wrapping on overflow.
template <>
inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
  const std::int32_t exact_diff =
      static_cast<std::int32_t>(a) - static_cast<std::int32_t>(b);
  if (exact_diff > std::numeric_limits<std::int16_t>::max()) {
    return std::numeric_limits<std::int16_t>::max();
  }
  if (exact_diff < std::numeric_limits<std::int16_t>::min()) {
    return std::numeric_limits<std::int16_t>::min();
  }
  return static_cast<std::int16_t>(exact_diff);
}

// Subtracts two int32 values, clamping the exact difference to the
// representable int32 range instead of wrapping on overflow.
template <>
inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
  const std::int64_t exact_diff =
      static_cast<std::int64_t>(a) - static_cast<std::int64_t>(b);
  const std::int64_t kInt32Min = std::numeric_limits<std::int32_t>::min();
  const std::int64_t kInt32Max = std::numeric_limits<std::int32_t>::max();
  if (exact_diff > kInt32Max) {
    return std::numeric_limits<std::int32_t>::max();
  }
  if (exact_diff < kInt32Min) {
    return std::numeric_limits<std::int32_t>::min();
  }
  return static_cast<std::int32_t>(exact_diff);
}
| |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingSub(a.raw(), b.raw())); |
| } |
| // End section to be moved to gemmlowp. |
| |
| namespace reference_ops { |
| |
| // TODO(b/80247582) Remove this constant. |
| // This will be phased out as the shifts are revised with more thought. Use of a |
| // constant enables us to track progress on this work. |
| // |
| // Used mainly to convert from old-style shifts (right) to new-style (left). |
| static constexpr int kReverseShift = -1; |
| |
// Builds a RuntimeShape from a legacy Dims<4>, emitting the four dimension
// sizes in reverse order (sizes[3] becomes the outermost dimension).
inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
  shape->BuildFrom(
      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
}
| |
// Returns the number of leading zero bits in `integer_input`; for a zero
// input, returns the full bit width of T.
template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
                "Only unsigned integer types handled.");
  const int total_bits = std::numeric_limits<T>::digits;
  if (integer_input == 0) {
    return total_bits;
  }
  // Locate the index of the most-significant set bit (bit 0 = LSB), then
  // convert it to a leading-zero count.
  int msb_position = 0;
  for (T probe = integer_input; probe > 1; probe >>= 1) {
    ++msb_position;
  }
  return total_bits - 1 - msb_position;
}
| |
// Multiplies x by 2^exponent with saturation. IntegerType may be a scalar or
// a gemmlowp SIMD-style wrapper type; ScalarRawType is the per-lane scalar.
// Lanes whose magnitude would overflow after the left shift are clamped to
// the scalar type's min/max instead.
template <typename IntegerType>
IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
  if (exponent == 0) {
    return x;
  }
  using ScalarIntegerType =
      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
  const IntegerType min =
      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
  const IntegerType max =
      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);

  // Any lane with |value| above this threshold would overflow when shifted
  // left by `exponent`, so those lanes saturate instead.
  // NOTE(review): assumes exponent > 0 here; a negative exponent would make
  // the shift count exceed the type width -- confirm callers guarantee this.
  const std::int32_t threshold =
      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
  const IntegerType positive_mask =
      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
  const IntegerType negative_mask =
      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));

  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
  // Overwrite overflowing lanes with the saturated bound.
  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
  return result;
}
| |
| // If we want to leave IntegerBits fixed, then multiplication |
| // by a power of two has to be saturating/rounding, not exact anymore. |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> |
| SaturatingRoundingMultiplyByPOTParam( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent)); |
| } |
| |
| inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& filter_shape, |
| const float* filter_data, const RuntimeShape& bias_shape, |
| const float* bias_data, const RuntimeShape& output_shape, |
| float* output_data, const RuntimeShape& im2col_shape, |
| float* im2col_data) { |
| const int stride_width = params.stride_width; |
| const int stride_height = params.stride_height; |
| const int dilation_width_factor = params.dilation_width_factor; |
| const int dilation_height_factor = params.dilation_height_factor; |
| const int pad_width = params.padding_values.width; |
| const int pad_height = params.padding_values.height; |
| const float output_activation_min = params.float_activation_min; |
| const float output_activation_max = params.float_activation_max; |
| TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); |
| |
| (void)im2col_data; // only used in optimized code. |
| (void)im2col_shape; // only used in optimized code. |
| const int batches = MatchingDim(input_shape, 0, output_shape, 0); |
| const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); |
| const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); |
| if (bias_data) { |
| TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); |
| } |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| const int filter_height = filter_shape.Dims(1); |
| const int filter_width = filter_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| for (int batch = 0; batch < batches; ++batch) { |
| for (int out_y = 0; out_y < output_height; ++out_y) { |
| for (int out_x = 0; out_x < output_width; ++out_x) { |
| for (int out_channel = 0; out_channel < output_depth; ++out_channel) { |
| const int in_x_origin = (out_x * stride_width) - pad_width; |
| const int in_y_origin = (out_y * stride_height) - pad_height; |
| float total = 0.f; |
| for (int filter_y = 0; filter_y < filter_height; ++filter_y) { |
| for (int filter_x = 0; filter_x < filter_width; ++filter_x) { |
| for (int in_channel = 0; in_channel < input_depth; ++in_channel) { |
| const int in_x = in_x_origin + dilation_width_factor * filter_x; |
| const int in_y = |
| in_y_origin + dilation_height_factor * filter_y; |
| // If the location is outside the bounds of the input image, |
| // use zero as a default value. |
| if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && |
| (in_y < input_height)) { |
| float input_value = input_data[Offset( |
| input_shape, batch, in_y, in_x, in_channel)]; |
| float filter_value = |
| filter_data[Offset(filter_shape, out_channel, filter_y, |
| filter_x, in_channel)]; |
| total += (input_value * filter_value); |
| } |
| } |
| } |
| } |
| float bias_value = 0.0f; |
| if (bias_data) { |
| bias_value = bias_data[out_channel]; |
| } |
| output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = |
| ActivationFunctionWithMinMax(total + bias_value, |
| output_activation_min, |
| output_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Conv(const float* input_data, const Dims<4>& input_dims, |
| const float* filter_data, const Dims<4>& filter_dims, |
| const float* bias_data, const Dims<4>& bias_dims, |
| int stride_width, int stride_height, int dilation_width_factor, |
| int dilation_height_factor, int pad_width, int pad_height, |
| float output_activation_min, float output_activation_max, |
| float* output_data, const Dims<4>& output_dims, |
| float* im2col_data, const Dims<4>& im2col_dims) { |
| tflite::ConvParams op_params; |
| // Padding type is ignored, but still set. |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = pad_width; |
| op_params.padding_values.height = pad_height; |
| op_params.stride_width = stride_width; |
| op_params.stride_height = stride_height; |
| op_params.dilation_width_factor = dilation_width_factor; |
| op_params.dilation_height_factor = dilation_height_factor; |
| op_params.float_activation_min = output_activation_min; |
| op_params.float_activation_max = output_activation_max; |
| |
| Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), |
| filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), |
| output_data, DimsToShape(im2col_dims), im2col_data); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <FusedActivationFunctionType Ac> |
| void Conv(const float* input_data, const Dims<4>& input_dims, |
| const float* filter_data, const Dims<4>& filter_dims, |
| const float* bias_data, const Dims<4>& bias_dims, int stride_width, |
| int stride_height, int dilation_width_factor, |
| int dilation_height_factor, int pad_width, int pad_height, |
| float* output_data, const Dims<4>& output_dims, float* im2col_data, |
| const Dims<4>& im2col_dims) { |
| float output_activation_min, output_activation_max; |
| GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); |
| Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, |
| stride_width, stride_height, dilation_width_factor, |
| dilation_height_factor, pad_width, pad_height, output_activation_min, |
| output_activation_max, output_data, output_dims, im2col_data, |
| im2col_dims); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
| template <FusedActivationFunctionType Ac> |
| void Conv(const float* input_data, const Dims<4>& input_dims, |
| const float* filter_data, const Dims<4>& filter_dims, |
| const float* bias_data, const Dims<4>& bias_dims, int stride_width, |
| int stride_height, int pad_width, int pad_height, float* output_data, |
| const Dims<4>& output_dims, float* im2col_data, |
| const Dims<4>& im2col_dims) { |
| float output_activation_min, output_activation_max; |
| GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); |
| Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, |
| stride_width, stride_height, 1, 1, pad_width, pad_height, |
| output_activation_min, output_activation_max, output_data, output_dims, |
| im2col_data, im2col_dims); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
// Forwards to the two-stride templated overload, using the same `stride` for
// both spatial dimensions.
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  // The two literal 1s are the width/height dilation factors.
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}
| |
| inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, |
| const uint8* input_data, const RuntimeShape& filter_shape, |
| const uint8* filter_data, const RuntimeShape& bias_shape, |
| const int32* bias_data, const RuntimeShape& output_shape, |
| uint8* output_data, const RuntimeShape& im2col_shape, |
| uint8* im2col_data, gemmlowp::GemmContext* gemm_context) { |
| (void)im2col_data; // only used in optimized code. |
| (void)im2col_shape; // only used in optimized code. |
| (void)gemm_context; // only used in optimized code. |
| const int stride_width = params.stride_width; |
| const int stride_height = params.stride_height; |
| const int dilation_width_factor = params.dilation_width_factor; |
| const int dilation_height_factor = params.dilation_height_factor; |
| const int pad_width = params.padding_values.width; |
| const int pad_height = params.padding_values.height; |
| const int32 input_offset = params.input_offset; |
| const int32 filter_offset = params.weights_offset; |
| const int32 output_offset = params.output_offset; |
| const int32 output_multiplier = params.output_multiplier; |
| const int output_shift = params.output_shift; |
| const int32 output_activation_min = params.quantized_activation_min; |
| const int32 output_activation_max = params.quantized_activation_max; |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| |
| TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); |
| const int batches = MatchingDim(input_shape, 0, output_shape, 0); |
| const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); |
| const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); |
| if (bias_data) { |
| TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); |
| } |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| const int filter_height = filter_shape.Dims(1); |
| const int filter_width = filter_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| for (int batch = 0; batch < batches; ++batch) { |
| for (int out_y = 0; out_y < output_height; ++out_y) { |
| for (int out_x = 0; out_x < output_width; ++out_x) { |
| for (int out_channel = 0; out_channel < output_depth; ++out_channel) { |
| const int in_x_origin = (out_x * stride_width) - pad_width; |
| const int in_y_origin = (out_y * stride_height) - pad_height; |
| int32 acc = 0; |
| for (int filter_y = 0; filter_y < filter_height; ++filter_y) { |
| for (int filter_x = 0; filter_x < filter_width; ++filter_x) { |
| for (int in_channel = 0; in_channel < input_depth; ++in_channel) { |
| const int in_x = in_x_origin + dilation_width_factor * filter_x; |
| const int in_y = |
| in_y_origin + dilation_height_factor * filter_y; |
| // If the location is outside the bounds of the input image, |
| // use zero as a default value. |
| if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && |
| (in_y < input_height)) { |
| int32 input_val = input_data[Offset(input_shape, batch, in_y, |
| in_x, in_channel)]; |
| int32 filter_val = |
| filter_data[Offset(filter_shape, out_channel, filter_y, |
| filter_x, in_channel)]; |
| acc += |
| (filter_val + filter_offset) * (input_val + input_offset); |
| } |
| } |
| } |
| } |
| if (bias_data) { |
| acc += bias_data[out_channel]; |
| } |
| acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, |
| output_shift); |
| acc += output_offset; |
| acc = std::max(acc, output_activation_min); |
| acc = std::min(acc, output_activation_max); |
| output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = |
| static_cast<uint8>(acc); |
| } |
| } |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Conv(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int stride_width, int stride_height, int dilation_width_factor, |
| int dilation_height_factor, int pad_width, int pad_height, |
| int32 output_offset, int32 output_multiplier, int output_shift, |
| int32 output_activation_min, int32 output_activation_max, |
| uint8* output_data, const Dims<4>& output_dims, |
| uint8* im2col_data, const Dims<4>& im2col_dims, |
| gemmlowp::GemmContext* gemm_context) { |
| tflite::ConvParams op_params; |
| // Padding type is ignored, but still set. |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = pad_width; |
| op_params.padding_values.height = pad_height; |
| op_params.stride_width = stride_width; |
| op_params.stride_height = stride_height; |
| op_params.dilation_width_factor = dilation_width_factor; |
| op_params.dilation_height_factor = dilation_height_factor; |
| op_params.input_offset = input_offset; |
| op_params.weights_offset = filter_offset; |
| op_params.output_offset = output_offset; |
| op_params.output_multiplier = output_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.output_shift = kReverseShift * output_shift; |
| op_params.quantized_activation_min = output_activation_min; |
| op_params.quantized_activation_max = output_activation_max; |
| |
| Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), |
| filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), |
| output_data, DimsToShape(im2col_dims), im2col_data, gemm_context); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
// Forwards to the dilation-aware legacy overload with both dilation factors
// set to 1 (the two literal 1s below), i.e. a non-dilated convolution.
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
// Forwards to the non-templated quantized Conv after checking that the
// requested fused activation is one expressible as a [min, max] clamp.
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    // Without a fused activation, the clamp must span the full uint8 range.
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
// Forwards to the two-stride templated overload, using the same `stride` for
// both spatial dimensions.
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}
| |
| template <typename T> |
| inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params, |
| const RuntimeShape& unextended_input_shape, |
| const T* input_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape input_shape = |
| RuntimeShape::ExtendedShape(4, unextended_input_shape); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| const int input_depth = input_shape.Dims(3); |
| const int input_width = input_shape.Dims(2); |
| const int input_height = input_shape.Dims(1); |
| const int input_batch = input_shape.Dims(0); |
| |
| const int output_depth = output_shape.Dims(3); |
| const int output_width = output_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_batch = output_shape.Dims(0); |
| |
| const int32 block_size = op_params.block_size; |
| |
| TFLITE_DCHECK_EQ(input_width * block_size, output_width); |
| TFLITE_DCHECK_EQ(input_height * block_size, output_height); |
| TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size); |
| TFLITE_DCHECK_EQ(input_batch, output_batch); |
| |
| for (int out_b = 0; out_b < output_batch; ++out_b) { |
| for (int out_h = 0; out_h < output_height; ++out_h) { |
| for (int out_w = 0; out_w < output_width; ++out_w) { |
| for (int out_d = 0; out_d < output_depth; ++out_d) { |
| const int in_d = |
| out_d + ((out_h % block_size) * block_size + out_w % block_size) * |
| output_depth; |
| |
| const int in_w = out_w / block_size; |
| const int in_h = out_h / block_size; |
| const int in_b = out_b; |
| |
| const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d); |
| const int output_index = |
| Offset(output_shape, out_b, out_h, out_w, out_d); |
| |
| output_data[output_index] = input_data[input_index]; |
| } |
| } |
| } |
| } |
| } |
| |
| template <typename T> |
| inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params, |
| const RuntimeShape& unextended_input_shape, |
| const T* input_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape input_shape = |
| RuntimeShape::ExtendedShape(4, unextended_input_shape); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| const int input_depth = input_shape.Dims(3); |
| const int input_width = input_shape.Dims(2); |
| const int input_height = input_shape.Dims(1); |
| const int input_batch = input_shape.Dims(0); |
| |
| const int output_depth = output_shape.Dims(3); |
| const int output_width = output_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_batch = output_shape.Dims(0); |
| |
| const int32 block_size = op_params.block_size; |
| |
| TFLITE_DCHECK_EQ(input_width, output_width * block_size); |
| TFLITE_DCHECK_EQ(input_height, output_height * block_size); |
| TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth); |
| TFLITE_DCHECK_EQ(input_batch, output_batch); |
| |
| for (int in_b = 0; in_b < input_batch; ++in_b) { |
| for (int in_h = 0; in_h < input_height; ++in_h) { |
| for (int in_w = 0; in_w < input_width; ++in_w) { |
| for (int in_d = 0; in_d < input_depth; ++in_d) { |
| const int out_d = |
| in_d + ((in_h % block_size) * block_size + in_w % block_size) * |
| input_depth; |
| const int out_w = in_w / block_size; |
| const int out_h = in_h / block_size; |
| const int out_b = in_b; |
| |
| const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d); |
| const int output_index = |
| Offset(output_shape, out_b, out_h, out_w, out_d); |
| |
| output_data[output_index] = input_data[input_index]; |
| } |
| } |
| } |
| } |
| } |
| |
| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& weights_shape, |
| const float* weights_data, const RuntimeShape& bias_shape, |
| const float* bias_data, const RuntimeShape& output_shape, |
| float* output_data) { |
| const float output_activation_min = params.float_activation_min; |
| const float output_activation_max = params.float_activation_max; |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dims_count = output_shape.DimensionsCount(); |
| const int weights_dims_count = weights_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); |
| const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2, |
| output_shape, output_dims_count - 1); |
| const int accum_depth = weights_shape.Dims(weights_dims_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| float total = 0.f; |
| for (int d = 0; d < accum_depth; ++d) { |
| total += input_data[b * accum_depth + d] * |
| weights_data[out_c * accum_depth + d]; |
| } |
| float bias_value = 0.0f; |
| if (bias_data) { |
| bias_value = bias_data[out_c]; |
| } |
| output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( |
| total + bias_value, output_activation_min, output_activation_max); |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, |
| const float* weights_data, |
| const Dims<4>& weights_dims, const float* bias_data, |
| const Dims<4>& bias_dims, |
| float output_activation_min, |
| float output_activation_max, float* output_data, |
| const Dims<4>& output_dims) { |
| tflite::FullyConnectedParams op_params; |
| op_params.float_activation_min = output_activation_min; |
| op_params.float_activation_max = output_activation_max; |
| |
| FullyConnected(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(weights_dims), weights_data, |
| DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
| template <FusedActivationFunctionType Ac> |
| void FullyConnected(const float* input_data, const Dims<4>& input_dims, |
| const float* weights_data, const Dims<4>& weights_dims, |
| const float* bias_data, const Dims<4>& bias_dims, |
| float* output_data, const Dims<4>& output_dims) { |
| float output_activation_min, output_activation_max; |
| GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); |
| FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, |
| bias_dims, output_activation_min, output_activation_max, |
| output_data, output_dims); |
| } |
| |
| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const uint8* input_data, const RuntimeShape& filter_shape, |
| const uint8* filter_data, const RuntimeShape& bias_shape, |
| const int32* bias_data, const RuntimeShape& output_shape, |
| uint8* output_data, gemmlowp::GemmContext* gemm_context) { |
| (void)gemm_context; // only used in optimized code. |
| const int32 input_offset = params.input_offset; |
| const int32 filter_offset = params.weights_offset; |
| const int32 output_offset = params.output_offset; |
| const int32 output_multiplier = params.output_multiplier; |
| const int output_shift = params.output_shift; |
| const int32 output_activation_min = params.quantized_activation_min; |
| const int32 output_activation_max = params.quantized_activation_max; |
| TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); |
| TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); |
| |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dim_count = output_shape.DimensionsCount(); |
| const int filter_dim_count = filter_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); |
| const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, |
| output_shape, output_dim_count - 1); |
| const int accum_depth = filter_shape.Dims(filter_dim_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| int32 acc = 0; |
| for (int d = 0; d < accum_depth; ++d) { |
| int32 input_val = input_data[b * accum_depth + d]; |
| int32 filter_val = filter_data[out_c * accum_depth + d]; |
| acc += (filter_val + filter_offset) * (input_val + input_offset); |
| } |
| if (bias_data) { |
| acc += bias_data[out_c]; |
| } |
| acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); |
| acc += output_offset; |
| acc = std::max(acc, output_activation_min); |
| acc = std::min(acc, output_activation_max); |
| output_data[out_c + output_depth * b] = static_cast<uint8>(acc); |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int32 output_offset, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, |
| int32 output_activation_max, uint8* output_data, |
| const Dims<4>& output_dims, |
| gemmlowp::GemmContext* gemm_context) { |
| tflite::FullyConnectedParams op_params; |
| op_params.input_offset = input_offset; |
| op_params.weights_offset = filter_offset; |
| op_params.output_offset = output_offset; |
| op_params.output_multiplier = output_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.output_shift = kReverseShift * output_shift; |
| op_params.quantized_activation_min = output_activation_min; |
| op_params.quantized_activation_max = output_activation_max; |
| |
| FullyConnected(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), |
| bias_data, DimsToShape(output_dims), output_data, |
| gemm_context); |
| } |
| |
| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const uint8* input_data, const RuntimeShape& filter_shape, |
| const uint8* filter_data, const RuntimeShape& bias_shape, |
| const int32* bias_data, const RuntimeShape& output_shape, |
| int16* output_data, gemmlowp::GemmContext* gemm_context) { |
| (void)gemm_context; // only used in optimized code. |
| const int32 input_offset = params.input_offset; |
| const int32 filter_offset = params.weights_offset; |
| const int32 output_offset = params.output_offset; |
| const int32 output_multiplier = params.output_multiplier; |
| const int output_shift = params.output_shift; |
| const int32 output_activation_min = params.quantized_activation_min; |
| const int32 output_activation_max = params.quantized_activation_max; |
| |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| TFLITE_DCHECK_EQ(output_offset, 0); |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dim_count = output_shape.DimensionsCount(); |
| const int filter_dim_count = filter_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); |
| const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, |
| output_shape, output_dim_count - 1); |
| const int accum_depth = filter_shape.Dims(filter_dim_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| // Internal accumulation. |
| // Initialize accumulator with the bias-value. |
| int32 accum = bias_data[out_c]; |
| // Accumulation loop. |
| for (int d = 0; d < accum_depth; ++d) { |
| int16 input_val = input_data[b * accum_depth + d] + input_offset; |
| int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset; |
| accum += filter_val * input_val; |
| } |
| // Down-scale the final int32 accumulator to the scale used by our |
| // (16-bit, typically 3 integer bits) fixed-point format. The quantized |
| // multiplier and shift here have been pre-computed offline |
| // (e.g. by toco). |
| accum = |
| MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift); |
| // Saturate, cast to int16, and store to output array. |
| accum = std::max(accum, output_activation_min - output_offset); |
| accum = std::min(accum, output_activation_max - output_offset); |
| accum += output_offset; |
| output_data[out_c + output_depth * b] = accum; |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int32 output_offset, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, |
| int32 output_activation_max, int16* output_data, |
| const Dims<4>& output_dims, |
| gemmlowp::GemmContext* gemm_context) { |
| tflite::FullyConnectedParams op_params; |
| op_params.input_offset = input_offset; |
| op_params.weights_offset = filter_offset; |
| op_params.output_offset = output_offset; |
| op_params.output_multiplier = output_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.output_shift = kReverseShift * output_shift; |
| op_params.quantized_activation_min = output_activation_min; |
| op_params.quantized_activation_max = output_activation_max; |
| |
| FullyConnected(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), |
| bias_data, DimsToShape(output_dims), output_data, |
| gemm_context); |
| } |
| |
// Quantized FullyConnected over pre-shuffled weights, producing int16 output.
// The weights have been pre-arranged for blocked access (the loops below walk
// them in 4-row x 16-column blocks) and pre-XORed with 0x80 so they can be
// reinterpreted as int8 with an implicit zero point of 128. Input activations
// are shuffled/XORed into shuffled_input_workspace_data here. Only batch
// sizes 1 and 4 are supported.
inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& weights_shape,
    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data, uint8* shuffled_input_workspace_data,
    gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  // The shuffled layout works on 4x16 blocks, so both dimensions must be
  // multiples of the block sizes.
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);

  // Shuffling and xoring of input activations into the workspace buffer
  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    // Single batch: no reordering needed, just flip the sign bit of each
    // activation (equivalent to subtracting the zero point 128 once the
    // bytes are reinterpreted as int8).
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    // Four batches: interleave the activations in depth-16 chunks across the
    // 4 batches so that the blocked compute loop below reads contiguously.
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8 src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8 values as int8, getting for free the
          // subtraction of the zero_point value 128.
          uint8 dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    // Only batch sizes 1 and 4 are supported.
    TFLITE_DCHECK(false);
    return;
  }

  // Actual computation
  if (batches == 1) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    // Process 4 output channels at a time, consuming the weights in the
    // blocked order they were pre-shuffled into.
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation.
      // Accumulators for 4 output channels, starting at zero; the bias is
      // added after the accumulation loop below.
      int32 accum[4] = {0};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8 input_val = shuffled_input_data[d + j];
            int8 weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
        int acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32 accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    // Process 4 output channels x 4 batches at a time, walking the weights
    // and the interleaved inputs in matching 64-byte (4x16) blocks.
    for (int c = 0; c < output_depth; c += 4) {
      const int8* shuffled_input_ptr = shuffled_input_data;
      // Internal accumulation.
      // 4x4 accumulator tile (output channel x batch), starting at zero; the
      // bias is added after the accumulation loop below.
      int32 accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8 input_val = shuffled_input_ptr[16 * b + j];
              int8 weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        // Advance both pointers by one 4x16 block.
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value
          int acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32 accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed offline
          // (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    // Only batch sizes 1 and 4 are supported.
    TFLITE_DCHECK(false);
    return;
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void ShuffledFullyConnected( |
| const uint8* input_data, const Dims<4>& input_dims, |
| const uint8* shuffled_weights_data, const Dims<4>& weights_dims, |
| const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, int32 output_activation_max, |
| int16* output_data, const Dims<4>& output_dims, |
| uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { |
| tflite::FullyConnectedParams op_params; |
| op_params.output_multiplier = output_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.output_shift = kReverseShift * output_shift; |
| op_params.quantized_activation_min = output_activation_min; |
| op_params.quantized_activation_max = output_activation_max; |
| |
| ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(weights_dims), shuffled_weights_data, |
| DimsToShape(bias_dims), bias_data, |
| DimsToShape(output_dims), output_data, |
| shuffled_input_workspace_data, gemm_context); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // legacy, for compatibility with old checked-in code |
| template <FusedActivationFunctionType Ac> |
| void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int32 output_offset, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, |
| int32 output_activation_max, uint8* output_data, |
| const Dims<4>& output_dims, |
| gemmlowp::GemmContext* gemm_context) { |
| static_assert(Ac == FusedActivationFunctionType::kNone || |
| Ac == FusedActivationFunctionType::kRelu || |
| Ac == FusedActivationFunctionType::kRelu6 || |
| Ac == FusedActivationFunctionType::kRelu1, |
| ""); |
| if (Ac == FusedActivationFunctionType::kNone) { |
| TFLITE_DCHECK_EQ(output_activation_min, 0); |
| TFLITE_DCHECK_EQ(output_activation_max, 255); |
| } |
| FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, |
| filter_offset, bias_data, bias_dims, output_offset, |
| output_multiplier, output_shift, output_activation_min, |
| output_activation_max, output_data, output_dims, gemm_context); |
| } |
| |
| inline void Relu(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| const float val = input_data[i]; |
| const float lower = 0; |
| const float clamped = val < lower ? lower : val; |
| output_data[i] = clamped; |
| } |
| } |
| |
| inline void Relu1(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| const float val = input_data[i]; |
| const float upper = 1; |
| const float lower = -1; |
| const float clamped = val > upper ? upper : val < lower ? lower : val; |
| output_data[i] = clamped; |
| } |
| } |
| |
| inline void Relu6(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| const float val = input_data[i]; |
| const float upper = 6; |
| const float lower = 0; |
| const float clamped = val > upper ? upper : val < lower ? lower : val; |
| output_data[i] = clamped; |
| } |
| } |
| |
| inline void ReluX(const tflite::ActivationParams& params, |
| const RuntimeShape& input_shape, const uint8* input_data, |
| const RuntimeShape& output_shape, uint8* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)"); |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| const uint8 max_value = params.quantized_activation_max; |
| const uint8 min_value = params.quantized_activation_min; |
| for (int i = 0; i < flat_size; ++i) { |
| const uint8 val = input_data[i]; |
| const uint8 clamped = |
| val > max_value ? max_value : val < min_value ? min_value : val; |
| output_data[i] = clamped; |
| } |
| } |
| |
| inline void L2Normalization(const tflite::L2NormalizationParams& op_params, |
| const RuntimeShape& input_shape, |
| const float* input_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| const int trailing_dim = input_shape.DimensionsCount() - 1; |
| const int outer_size = |
| MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); |
| const int depth = |
| MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); |
| for (int i = 0; i < outer_size; ++i) { |
| float squared_l2_norm = 0; |
| for (int c = 0; c < depth; ++c) { |
| const float val = input_data[depth * i + c]; |
| squared_l2_norm += val * val; |
| } |
| const float l2_norm = std::sqrt(squared_l2_norm); |
| for (int c = 0; c < depth; ++c) { |
| output_data[depth * i + c] = input_data[depth * i + c] / l2_norm; |
| } |
| } |
| } |
| |
// Computes a fixed-point approximation of 1/sqrt(input), returned as a
// quantized multiplier (*output_inv_sqrt) plus a shift (*output_shift,
// converted at the end to this file's positive-means-left convention via
// kReverseShift). Requires input > 0 (DCHECKed below).
inline void GetInvSqrtQuantizedMultiplierExp(int32 input,
                                             int32* output_inv_sqrt,
                                             int* output_shift) {
  *output_shift = 11;
  // Range-reduce large inputs by factors of 4, compensating in the shift:
  // dividing the radicand by 4 halves its square root (one bit).
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  // Normalize input up into [2^27, 2^29) by left-shifting in bit pairs,
  // again compensating in the shift.
  const unsigned max_left_shift_bits =
      CountLeadingZeros(static_cast<uint32>(input)) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  // The iteration operates on input/2 (note the >> 1 and the
  // half_sqrt_2 compensation at the end).
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration for the inverse square root:
  //   x_{n+1} = 1.5 * x_n - 0.5 * input * x_n^3
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  // Multiply by sqrt(2)/2 to compensate for having iterated on input/2:
  // 1/sqrt(input/2) * sqrt(2)/2 == 1/sqrt(input).
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  // Fold any residual negative (left) shift into the multiplier itself so
  // the returned shift is never negative before the final sign flip.
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
  // Convert right shift (right is positive) to left shift.
  *output_shift *= kReverseShift;
}
| |
// Quantized (uint8) L2 normalization over the trailing dimension. The output
// is produced with zero point 128 and scale 1/128 (the "128 *" and "128 +"
// below), covering the [-1, 1] range of a normalized vector.
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const uint8* input_data,
                            const RuntimeShape& output_shape,
                            uint8* output_data) {
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int32 input_zero_point = op_params.input_zero_point;
  for (int i = 0; i < outer_size; ++i) {
    // Sum of squares of zero-point-corrected inputs; each square is < 2^16.
    int32 square_l2_norm = 0;
    for (int c = 0; c < depth; c++) {
      int32 diff = input_data[depth * i + c] - input_zero_point;
      square_l2_norm += diff * diff;
    }
    // NOTE(review): an inner vector that is entirely equal to the zero point
    // yields square_l2_norm == 0, which trips the DCHECK_GT(input, 0) inside
    // GetInvSqrtQuantizedMultiplierExp — confirm callers exclude that case.
    int32 inv_l2norm_multiplier;
    int inv_l2norm_shift;
    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, &inv_l2norm_multiplier,
                                     &inv_l2norm_shift);

    for (int c = 0; c < depth; c++) {
      int32 diff = input_data[depth * i + c] - input_zero_point;
      // rescaled_diff ~= 128 * diff / l2_norm.
      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
      // Re-center on the output zero point (128) and clamp to uint8 range.
      int32 unclamped_output_val = 128 + rescaled_diff;
      int32 output_val = std::min(255, std::max(0, unclamped_output_val));
      output_data[depth * i + c] = static_cast<uint8>(output_val);
    }
  }
}
| |
| template <typename T> |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape& input2_shape, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] + input2_data[i], params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const float* input1_data, |
| const RuntimeShape& input2_shape, const float* input2_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < size; i++) { |
| auto x = input1_data[i] + input2_data[i]; |
| output_data[i] = ActivationFunctionWithMinMax( |
| x, params.float_activation_min, params.float_activation_max); |
| } |
| } |
| |
| // Element-wise add that can often be used for inner loop of broadcast add as |
| // well as the non-broadcast add. |
| inline void AddElementwise(int size, const ArithmeticParams& params, |
| const uint8* input1_data, const uint8* input2_data, |
| uint8* output_data) { |
| TFLITE_DCHECK_GT(params.input1_offset, -256); |
| TFLITE_DCHECK_GT(params.input2_offset, -256); |
| TFLITE_DCHECK_LT(params.input1_offset, 256); |
| TFLITE_DCHECK_LT(params.input2_offset, 256); |
| |
| for (int i = 0; i < size; ++i) { |
| const int32 input1_val = params.input1_offset + input1_data[i]; |
| const int32 input2_val = params.input2_offset + input2_data[i]; |
| const int32 shifted_input1_val = input1_val * (1 << params.left_shift); |
| const int32 shifted_input2_val = input2_val * (1 << params.left_shift); |
| const int32 scaled_input1_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input1_val, params.input1_multiplier, params.input1_shift); |
| const int32 scaled_input2_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input2_val, params.input2_multiplier, params.input2_shift); |
| const int32 raw_sum = scaled_input1_val + scaled_input2_val; |
| const int32 raw_output = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| raw_sum, params.output_multiplier, params.output_shift) + |
| params.output_offset; |
| const int32 clamped_output = |
| std::min(params.quantized_activation_max, |
| std::max(params.quantized_activation_min, raw_output)); |
| output_data[i] = static_cast<uint8>(clamped_output); |
| } |
| } |
| |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const uint8* input1_data, |
| const RuntimeShape& input2_shape, const uint8* input2_data, |
| const RuntimeShape& output_shape, uint8* output_data) { |
| TFLITE_DCHECK_LE(params.quantized_activation_min, |
| params.quantized_activation_max); |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| |
| TFLITE_DCHECK_GT(params.input1_offset, -256); |
| TFLITE_DCHECK_GT(params.input2_offset, -256); |
| TFLITE_DCHECK_LT(params.input1_offset, 256); |
| TFLITE_DCHECK_LT(params.input2_offset, 256); |
| AddElementwise(flat_size, params, input1_data, input2_data, output_data); |
| } |
| |
// int16 add using gemmlowp 16-bit fixed point with 0 integer bits. The two
// inputs may differ in scale by a power of two: exactly one of the two shift
// params may be nonzero, and shifts are <= 0 (right shifts in this file's
// positive-means-left convention). The shifted operand is rescaled, then the
// operands are combined with a saturating fixed-point add.
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingFlatSize(output_shape, input1_shape, input2_shape);
  const int16 output_activation_min = params.quantized_activation_min;
  const int16 output_activation_max = params.quantized_activation_max;

  // At most one operand carries a shift, and both shifts are non-positive.
  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  // Select which operand needs rescaling (the one with the nonzero shift).
  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    // Rounding right shift brings the other operand onto the same scale.
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16 raw_output = result.raw();
    // Clamp to the fused activation range.
    const int16 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}
| |
| // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary |
| // dimensionality if the runtime code does a single loop over one dimension |
| // that handles broadcasting as the base case. The code generator would then |
| // generate max(D1, D2) nested for loops. |
| // TODO(benoitjacob): BroadcastAdd is intentionally duplicated from |
| // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> |
| // is no longer referenced in this file, move NdArrayDesc<T> from types.h to |
| // reference_ops.h. |
| inline void BroadcastAdd4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float"); |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] + |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.float_activation_min, params.float_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| inline void BroadcastAdd4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const int32* input1_data, |
| const RuntimeShape& input2_shape, |
| const int32* input2_data, |
| const RuntimeShape& output_shape, |
| int32* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32"); |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] + |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
// Quantized (uint8) broadcast add. Performs the same per-element arithmetic
// as AddElementwise above, but indexes the inputs through NdArrayDescs so
// either operand may be broadcast against the output shape.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          // Zero-point-correct both (possibly broadcast) inputs.
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          // Upscale by 2^left_shift to keep precision through rescaling.
          const int32 shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32 shifted_input2_val =
              input2_val * (1 << params.left_shift);
          // Bring both operands onto a common scale.
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          // Rescale the sum to the output scale and re-center on the output
          // zero point.
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          // Clamp to the fused activation range and store as uint8.
          const int32 clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
| |
// Fivefold-broadcast add for quantized uint8 tensors.
//
// params.broadcast_shape[0..4] holds the five collapsed loop extents the
// broadcast has been reduced to. The loop nest assumes the *first* input is
// the one that "broadcasts fast"; when broadcast_category says it is the
// second input instead, the two inputs are swapped together with all of
// their per-input quantization parameters, and the same loop nest is reused.
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  // Copy of params with the roles of input1 and input2 exchanged
  // (offset, multiplier and shift each swapped pairwise).
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  for (int i0 = 0; i0 < y0; ++i0) {
    const uint8* input2_data_ptr;
    for (int i1 = 0; i1 < y1; ++i1) {
      // input2 rewinds to the start of the current i0 section on every i1
      // iteration, while input1 keeps advancing (one y4-run per i2).
      input2_data_ptr = input2_data_reset;
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                         output_data_ptr);
          input2_data_ptr += y4;
          output_data_ptr += y4;
        }
        input1_data_ptr += y4;
      }
    }
    // The next i0 section of input2 begins where this one ended.
    input2_data_reset = input2_data_ptr;
  }
}
| |
| template <typename T> |
| inline void Mul(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape& input2_shape, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| T output_activation_min; |
| T output_activation_max; |
| GetActivationParams(params, &output_activation_min, &output_activation_max); |
| |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] * input2_data[i], output_activation_min, |
| output_activation_max); |
| } |
| } |
| |
| // TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary |
| // dimensionality if the runtime code does a single loop over one dimension |
| // that handles broadcasting as the base case. The code generator would then |
| // generate max(D1, D2) nested for loops. |
| // TODO(benoitjacob): BroadcastMul is intentionally duplicated from |
| // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> |
| // is no longer referenced in this file, move NdArrayDesc<T> from types.h to |
| // reference_ops.h. |
| template <typename T> |
| void BroadcastMul4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& unextended_input1_shape, |
| const T* input1_data, |
| const RuntimeShape& unextended_input2_shape, |
| const T* input2_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow"); |
| T output_activation_min; |
| T output_activation_max; |
| GetActivationParams(params, &output_activation_min, &output_activation_max); |
| |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| output_data[Offset(output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] * |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| output_activation_min, output_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| // Element-wise mul that can often be used for inner loop of broadcast Mul as |
| // well as the non-broadcast Mul. |
| inline void MulElementwise(int size, const ArithmeticParams& params, |
| const uint8* input1_data, const uint8* input2_data, |
| uint8* output_data) { |
| for (int i = 0; i < size; ++i) { |
| const int32 input1_val = params.input1_offset + input1_data[i]; |
| const int32 input2_val = params.input2_offset + input2_data[i]; |
| const int32 unclamped_result = |
| params.output_offset + |
| MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, |
| params.output_multiplier, |
| params.output_shift); |
| const int32 clamped_output = |
| std::min(params.quantized_activation_max, |
| std::max(params.quantized_activation_min, unclamped_result)); |
| output_data[i] = static_cast<uint8>(clamped_output); |
| } |
| } |
| |
| inline void Mul(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const uint8* input1_data, |
| const RuntimeShape& input2_shape, const uint8* input2_data, |
| const RuntimeShape& output_shape, uint8* output_data) { |
| TFLITE_DCHECK_LE(params.quantized_activation_min, |
| params.quantized_activation_max); |
| gemmlowp::ScopedProfilingLabel label("Mul/8bit"); |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| |
| MulElementwise(flat_size, params, input1_data, input2_data, output_data); |
| } |
| |
// Fivefold-broadcast mul for quantized uint8 tensors.
//
// Mirrors BroadcastAddFivefold, but only the input zero points need to be
// swapped when the second input is the fast-broadcasting one:
// MulElementwise rescales with a single output multiplier/shift, not with
// per-input multipliers.
inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  // Copy of params with the input1/input2 zero points exchanged.
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input2_offset = unswitched_params.input1_offset;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  for (int i0 = 0; i0 < y0; ++i0) {
    const uint8* input2_data_ptr;
    for (int i1 = 0; i1 < y1; ++i1) {
      // input2 rewinds to the start of the current i0 section each i1
      // iteration; input1 keeps advancing (one y4-run per i2).
      input2_data_ptr = input2_data_reset;
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                         output_data_ptr);
          input2_data_ptr += y4;
          output_data_ptr += y4;
        }
        input1_data_ptr += y4;
      }
    }
    // The next i0 section of input2 begins where this one ended.
    input2_data_reset = input2_data_ptr;
  }
}
| |
// Reference broadcasting mul for quantized uint8 tensors.
//
// Per element: re-center both operands by their zero points, requantize the
// 32-bit product to the output scale, add the output zero point, clamp to
// the quantized activation range, and narrow back to uint8.
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  // The input shapes are extended as part of NdArrayDesc initialization.
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // Innermost loop runs over the smallest-stride dimension for cache
  // friendliness (TF dimension order: batch, height, width, depth).
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          // Re-center each quantized value by its zero point.
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          // Requantize the product to the output scale and re-center on the
          // output zero point.
          const int32 unclamped_result =
              params.output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  input1_val * input2_val, params.output_multiplier,
                  params.output_shift);
          // Fused activation clamp.
          const int32 clamped_output = std::min(
              params.quantized_activation_max,
              std::max(params.quantized_activation_min, unclamped_result));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
| |
| inline void Mul(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const int16* input1_data, |
| const RuntimeShape& input2_shape, const int16* input2_data, |
| const RuntimeShape& output_shape, int16* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Mul/Int16"); |
| |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| // F0 uses 0 integer bits, range [-1, 1]. |
| using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; |
| |
| F0 unclamped_result = |
| F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]); |
| output_data[i] = unclamped_result.raw(); |
| } |
| } |
| |
| inline void Mul(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const int16* input1_data, |
| const RuntimeShape& input2_shape, const int16* input2_data, |
| const RuntimeShape& output_shape, uint8* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8"); |
| int32 output_offset = params.output_offset; |
| int32 output_activation_min = params.quantized_activation_min; |
| int32 output_activation_max = params.quantized_activation_max; |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| // F0 uses 0 integer bits, range [-1, 1]. |
| using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; |
| |
| F0 unclamped_result = |
| F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]); |
| int16 rescaled_result = |
| gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8); |
| int16 clamped_result = |
| std::min<int16>(output_activation_max - output_offset, rescaled_result); |
| clamped_result = |
| std::max<int16>(output_activation_min - output_offset, clamped_result); |
| output_data[i] = output_offset + clamped_result; |
| } |
| } |
| |
| // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary |
| // dimensionality if the runtime code does a single loop over one dimension |
| // that handles broadcasting as the base case. The code generator would then |
| // generate max(D1, D2) nested for loops. |
| template <typename T> |
| void BroadcastDiv4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& unextended_input1_shape, |
| const T* input1_data, |
| const RuntimeShape& unextended_input2_shape, |
| const T* input2_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| T output_activation_min; |
| T output_activation_max; |
| GetActivationParams(params, &output_activation_min, &output_activation_max); |
| |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest |
| // stride, typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for |
| // the best cache behavior. |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| output_data[Offset(output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] / |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| output_activation_min, output_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| template <typename T> |
| void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, |
| const T* input2_data, const Dims<4>& input2_dims, |
| T output_activation_min, T output_activation_max, |
| T* output_data, const Dims<4>& output_dims) { |
| tflite::ArithmeticParams op_params; |
| SetActivationParams(output_activation_min, output_activation_max, &op_params); |
| |
| BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data, |
| DimsToShape(input2_dims), input2_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename T> |
| inline void Div(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape& input2_shape, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| T output_activation_min; |
| T output_activation_max; |
| GetActivationParams(params, &output_activation_min, &output_activation_max); |
| |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] / input2_data[i], output_activation_min, |
| output_activation_max); |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| template <typename T> |
| inline void Div(const T* input1_data, const Dims<4>& input1_dims, |
| const T* input2_data, const Dims<4>& input2_dims, |
| T output_activation_min, T output_activation_max, |
| T* output_data, const Dims<4>& output_dims) { |
| tflite::ArithmeticParams op_params; |
| SetActivationParams(output_activation_min, output_activation_max, &op_params); |
| |
| Div(op_params, DimsToShape(input1_dims), input1_data, |
| DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
| inline void SubNonBroadcast(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] - input2_data[i], params.float_activation_min, |
| params.float_activation_max); |
| } |
| } |
| |
| inline void SubNonBroadcast(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const int32* input1_data, |
| const RuntimeShape& input2_shape, |
| const int32* input2_data, |
| const RuntimeShape& output_shape, |
| int32* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] - input2_data[i], params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| |
| // TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary |
| // dimensionality if the runtime code does a single loop over one dimension |
| // that handles broadcasting as the base case. The code generator would then |
| // generate max(D1, D2) nested for loops. |
| // TODO(benoitjacob): BroadcastSub is intentionally duplicated from |
| // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> |
| // is no longer referenced in this file, move NdArrayDesc<T> from types.h to |
| // reference_ops.h. |
| inline void BroadcastSub4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/float"); |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] - |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.float_activation_min, params.float_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
// Reference broadcasting subtract for quantized uint8 tensors.
//
// Mirrors BroadcastAdd4DSlow: both operands are re-centered by their zero
// points, left-shifted for precision, rescaled onto a common scale, and the
// difference is requantized to the output scale, re-centered on the output
// zero point, clamped to the quantized activation range, and narrowed to
// uint8.
inline void BroadcastSub4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/uint8");
  // Per-input indexing descriptors; shapes are extended to 4D inside.
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          // Re-center each quantized value by its zero point.
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          // Shift left to gain precision ahead of the fixed-point rescales.
          const int32 shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32 shifted_input2_val =
              input2_val * (1 << params.left_shift);
          // Bring both operands onto a common scale.
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
          // Requantize the difference to the output scale and re-center on
          // the output zero point.
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sub, params.output_multiplier, params.output_shift) +
              params.output_offset;
          // Fused activation clamp.
          const int32 clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
| |
| inline void BroadcastSub4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const int32* input1_data, |
| const RuntimeShape& input2_shape, |
| const int32* input2_data, |
| const RuntimeShape& output_shape, |
| int32* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/int32"); |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] - |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
// Generic broadcasting subtraction over (at most) 4D shapes.
//
// NOTE(review): this template clamps with params.quantized_activation_min/max
// regardless of T, so it only behaves sensibly for integer element types;
// float callers should use the float overload above, which reads
// params.float_activation_min/max — confirm before relying on it for other T.
template <typename T>
void BroadcastSub4DSlow(const ArithmeticParams& params,
                        const RuntimeShape& input1_shape, const T* input1_data,
                        const RuntimeShape& input2_shape, const T* input2_data,
                        const RuntimeShape& output_shape, T* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/templated");
  // Per-input indexing descriptors; shapes are extended to 4D inside.
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.quantized_activation_min,
                  params.quantized_activation_max);
        }
      }
    }
  }
}
| |
| template <typename T> |
| void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape, |
| const T* input1_data, const RuntimeShape& input2_shape, |
| const T* input2_data, const RuntimeShape& output_shape, |
| T* output_data) { |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
| // In Tensorflow, the dimensions are canonically named (batch_number, row, |
| // col, channel), with extents (batches, height, width, depth), with the |
| // trailing dimension changing most rapidly (channels has the smallest stride, |
| // typically 1 element). |
| // |
| // In generated C code, we store arrays with the dimensions reversed. The |
| // first dimension has smallest stride. |
| // |
| // We name our variables by their Tensorflow convention, but generate C code |
| // nesting loops such that the innermost loop has the smallest stride for the |
| // best cache behavior. |
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] - |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)]; |
| } |
| } |
| } |
| } |
| } |
| |
| inline void SubWithActivation(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const int32* input1_data, |
| const RuntimeShape& input2_shape, |
| const int32* input2_data, |
| const RuntimeShape& output_shape, |
| int32* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, input2_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] - input2_data[i], params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| |
| inline void SubWithActivation(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, input2_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] - input2_data[i], params.float_activation_min, |
| params.float_activation_max); |
| } |
| } |
| |
// Concatenates params.inputs_count arrays along params.axis into
// output_data. Every input must match the output on all dimensions except
// the concat axis, and the inputs' concat-axis extents must sum to the
// output's extent on that axis.
template <typename Scalar>
inline void Concatenation(const ConcatenationParams& params,
                          const RuntimeShape* const* input_shapes,
                          const Scalar* const* input_data,
                          const RuntimeShape& output_shape,
                          Scalar* output_data) {
  int axis = params.axis;
  int inputs_count = params.inputs_count;
  const int concat_dimensions = output_shape.DimensionsCount();
  TFLITE_DCHECK_LT(axis, concat_dimensions);

  // Validate the non-concat dimensions and accumulate the concat-axis total.
  int64_t concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
    for (int j = 0; j < concat_dimensions; j++) {
      if (j != axis) {
        MatchingDim(*input_shapes[i], j, output_shape, j);
      }
    }
    concat_size += input_shapes[i]->Dims(axis);
  }
  TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
  // Product of dimensions before the axis: number of outer slices.
  int64_t outer_size = 1;
  for (int i = 0; i < axis; ++i) {
    outer_size *= output_shape.Dims(i);
  }
  // For all input arrays,
  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
  int64_t base_inner_size = 1;
  for (int i = axis + 1; i < concat_dimensions; ++i) {
    base_inner_size *= output_shape.Dims(i);
  }

  // For each outer slice k, append each input's contiguous chunk of
  // Dims(axis) * base_inner_size scalars, in input order.
  Scalar* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
      memcpy(output_ptr, input_data[i] + k * copy_size,
             copy_size * sizeof(Scalar));
      output_ptr += copy_size;
    }
  }
}
| |
// TODO(b/80418076): Move to legacy ops file, update invocations.
// Legacy Dims<4> entry point; adapts the reversed legacy dimension ordering
// to the ConcatenationParams interface.
template <FusedActivationFunctionType Ac, typename Scalar>
inline void Concatenation(int concat_dim, const Scalar* const* input_data,
                          const Dims<4>* const* input_dims, int inputs_count,
                          Scalar* output_data, const Dims<4>& output_dims) {
  // For now we don't have a model with a Concatenation with fused activation.
  TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone);

  // Convert each legacy Dims<4> to a RuntimeShape and build the pointer
  // array the new-style Concatenation expects.
  std::vector<RuntimeShape> input_shapes(inputs_count);
  std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count);
  for (int i = 0; i < inputs_count; ++i) {
    ShapeFromDims(*input_dims[i], &input_shapes[i]);
    input_shapes_indirect[i] = &input_shapes[i];
  }
  tflite::ConcatenationParams op_params;
  // Dims<4> numbering is reversed relative to RuntimeShape, hence the flip.
  op_params.axis = 3 - concat_dim;
  op_params.inputs_count = inputs_count;

  Concatenation(op_params, input_shapes_indirect.data(), input_data,
                DimsToShape(output_dims), output_data);
}
| |
| // TODO(prabhumk): This is the same as the optimized implementation. |
// TODO(prabhumk): The quantized implementation of concatenation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
// when optimizing this routine further.
| |
| // template <> |
// Quantized (uint8) concatenation along params.axis with per-input
// requantization: inputs whose (zero point, scale) already match the output
// quantization are copied straight through with memcpy; all others are
// rescaled elementwise to the output quantization, rounded and clamped to
// [0, 255].
inline void ConcatenationWithScaling(const ConcatenationParams& params,
                                     const RuntimeShape* const* input_shapes,
                                     const uint8* const* input_data,
                                     const RuntimeShape& output_shape,
                                     uint8* output_data) {
  int axis = params.axis;
  const int32* input_zeropoint = params.input_zeropoint;
  const float* input_scale = params.input_scale;
  int inputs_count = params.inputs_count;
  const int32 output_zeropoint = params.output_zeropoint;
  const float output_scale = params.output_scale;

  // The arguments input_zeropoint and input_scale are expected to be an array
  // that have the quantization parameters for all the inputs to the concat
  // operator.
  TFLITE_DCHECK_GT(inputs_count, 1);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  // Check that all non-concat dimensions match the output, and accumulate
  // the total extent of the concat axis across inputs.
  int64_t concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), 4);
    for (int j = 0; j < 4; j++) {
      if (j != axis) {
        MatchingDim(*input_shapes[i], j, output_shape, j);
      }
    }
    concat_size += input_shapes[i]->Dims(axis);
  }
  TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
  // Number of independent outer slices (product of dims before the axis).
  int64_t outer_size = 1;
  for (int i = 0; i < axis; ++i) {
    outer_size *= output_shape.Dims(i);
  }
  // For all input arrays,
  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
  int64_t base_inner_size = 1;
  for (int i = axis + 1; i < 4; ++i) {
    base_inner_size *= output_shape.Dims(i);
  }
  const float inverse_output_scale = 1.f / output_scale;
  uint8* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      // copy_size counts elements; since the data is uint8 it is also the
      // byte count passed to memcpy.
      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
      const uint8* input_ptr = input_data[i] + k * copy_size;
      if (input_zeropoint[i] == output_zeropoint &&
          input_scale[i] == output_scale) {
        // Fast path: identical quantization, raw byte copy.
        memcpy(output_ptr, input_ptr, copy_size);
      } else {
        // Requantize: q_out = round(q_in * (scale_i / scale_out)
        //                           - zp_i * (scale_i / scale_out)) + zp_out,
        // clamped to the uint8 range.
        const float scale = input_scale[i] * inverse_output_scale;
        const float bias = -input_zeropoint[i] * scale;
        for (int j = 0; j < copy_size; ++j) {
          const int32_t value =
              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
              output_zeropoint;
          output_ptr[j] =
              static_cast<uint8_t>(std::max(std::min(255, value), 0));
        }
      }
      output_ptr += copy_size;
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| inline void Concatenation(int concat_dim, const uint8* const* input_data, |
| const Dims<4>* const* input_dims, |
| const int32* input_zeropoint, |
| const float* input_scale, int inputs_count, |
| uint8* output_data, const Dims<4>& output_dims, |
| const int32 output_zeropoint, |
| const float output_scale) { |
| std::vector<RuntimeShape> input_shapes(inputs_count); |
| std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); |
| for (int i = 0; i < inputs_count; ++i) { |
| ShapeFromDims(*input_dims[i], &input_shapes[i]); |
| input_shapes_indirect[i] = &input_shapes[i]; |
| } |
| tflite::ConcatenationParams op_params; |
| op_params.axis = 3 - concat_dim; |
| op_params.input_zeropoint = input_zeropoint; |
| op_params.input_scale = input_scale; |
| op_params.inputs_count = inputs_count; |
| op_params.output_zeropoint = output_zeropoint; |
| op_params.output_scale = output_scale; |
| |
| ConcatenationWithScaling(op_params, input_shapes_indirect.data(), input_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename Scalar> |
| void Pack(int dim, const Scalar* const* input_data, |
| const Dims<4>* const* input_dims, int inputs_count, |
| Scalar* output_data, const Dims<4>& output_dims) { |
| TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); |
| int outer_size = 1; |
| for (int i = dim + 1; i < 4; i++) { |
| outer_size *= output_dims.sizes[i]; |
| } |
| Scalar* output_ptr = output_data; |
| const int copy_size = FlatSize(**input_dims) / outer_size; |
| for (int k = 0; k < outer_size; k++) { |
| for (int i = 0; i < inputs_count; ++i) { |
| memcpy(output_ptr, input_data[i] + k * copy_size, |
| copy_size * sizeof(Scalar)); |
| output_ptr += copy_size; |
| } |
| } |
| } |
| |
| template <typename Scalar> |
| void Unpack(int axis, const Scalar* input_data, const Dims<4>& input_dims, |
| int dimensions, int outputs_count, Scalar* const* output_datas, |
| const Dims<4>& output_dims) { |
| int outer_size = 1; |
| for (int i = dimensions - axis; i < 4; i++) { |
| outer_size *= input_dims.sizes[i]; |
| } |
| |
| const int copy_size = FlatSize(input_dims) / outer_size / outputs_count; |
| for (int k = 0; k < outer_size; k++) { |
| for (int i = 0; i < outputs_count; ++i) { |
| Scalar* output_ptr = output_datas[i] + copy_size * k; |
| int loc = k * outputs_count * copy_size + i * copy_size; |
| memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar)); |
| } |
| } |
| } |
| |
| template <typename Scalar> |
| void Pack(int dim, const Scalar* const* input_data, |
| const Dims<4>* const* input_dims, const int32* input_zeropoint, |
| const float* input_scale, int inputs_count, Scalar* output_data, |
| const Dims<4>& output_dims, const int32 output_zeropoint, |
| const float output_scale) { |
| TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); |
| int outer_size = 1; |
| for (int i = dim + 1; i < 4; i++) { |
| outer_size *= output_dims.sizes[i]; |
| } |
| Scalar* output_ptr = output_data; |
| const int copy_size = FlatSize(**input_dims) / outer_size; |
| const float inverse_output_scale = 1.f / output_scale; |
| for (int k = 0; k < outer_size; k++) { |
| for (int i = 0; i < inputs_count; ++i) { |
| if (input_zeropoint[i] == output_zeropoint && |
| input_scale[i] == output_scale) { |
| memcpy(output_ptr, input_data[i] + k * copy_size, |
| copy_size * sizeof(Scalar)); |
| } else { |
| assert(false); |
| const float scale = input_scale[i] * inverse_output_scale; |
| const float bias = -input_zeropoint[i] * scale; |
| auto input_ptr = input_data[i]; |
| for (int j = 0; j < copy_size; ++j) { |
| const int32_t value = |
| static_cast<int32_t>(round(input_ptr[j] * scale + bias)) + |
| output_zeropoint; |
| output_ptr[j] = |
| static_cast<uint8_t>(std::max(std::min(255, value), 0)); |
| } |
| } |
| output_ptr += copy_size; |
| } |
| } |
| } |
| |
// Depth concatenation: concatenation along legacy dimension 0 (the depth
// axis in Dims<4> ordering). Pure forwarder to the legacy Concatenation.
template <FusedActivationFunctionType Ac, typename Scalar>
void DepthConcatenation(const Scalar* const* input_data,
                        const Dims<4>* const* input_dims, int inputs_count,
                        Scalar* output_data, const Dims<4>& output_dims) {
  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
                            output_data, output_dims);
}
| |
// Float LSTM cell, one timestep:
//   1. Depth-concatenate input and previous activations into concat_temp.
//   2. Run one fully-connected layer producing 4 * output_depth
//      pre-activations in activ_temp, laid out along depth as
//      [input gate, new input, forget gate, output gate].
//   3. Apply the gate math to update the cell state (output_state) and
//      produce the output activations (output_activ).
// All shapes of rank <= 4 are padded out to rank 4 before use.
inline void LstmCell(
    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
    const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
    const float* prev_activ_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& unextended_bias_shape,
    const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
    const float* prev_state_data,
    const RuntimeShape& unextended_output_state_shape, float* output_state_data,
    const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
    const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
    const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
  // Pad all shapes out to rank 4.
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape prev_activ_shape =
      RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
  const RuntimeShape bias_shape =
      RuntimeShape::ExtendedShape(4, unextended_bias_shape);
  const RuntimeShape prev_state_shape =
      RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
  const RuntimeShape output_state_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
  const RuntimeShape output_activ_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
  const RuntimeShape concat_temp_shape =
      RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
  const RuntimeShape activ_temp_shape =
      RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);

  // Gather dimension sizes and perform consistency checks.
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches =
      MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
                  output_state_shape, 0, output_activ_shape, 0);
  const int height =
      MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
                  output_state_shape, 1, output_activ_shape, 1);
  const int width =
      MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
                  output_state_shape, 2, output_activ_shape, 2);
  const int input_depth = input_shape.Dims(3);
  const int prev_activ_depth = prev_activ_shape.Dims(3);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
                   total_input_depth);
  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
  // The fully-connected output depth is 4x the cell output depth: one slice
  // per gate.
  const int intern_activ_depth =
      MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
  TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
                   intern_activ_depth * total_input_depth);
  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
                  3, output_activ_shape, 3);
  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);

  // Concatenate prev_activ and input data together
  std::vector<float const*> concat_input_arrays_data;
  std::vector<RuntimeShape const*> concat_input_arrays_shapes;
  concat_input_arrays_data.push_back(input_data);
  concat_input_arrays_data.push_back(prev_activ_data);
  concat_input_arrays_shapes.push_back(&input_shape);
  concat_input_arrays_shapes.push_back(&prev_activ_shape);
  tflite::ConcatenationParams concat_params;
  concat_params.axis = 3;
  concat_params.inputs_count = concat_input_arrays_data.size();
  Concatenation(concat_params, &(concat_input_arrays_shapes[0]),
                &(concat_input_arrays_data[0]), concat_temp_shape,
                concat_temp_data);

  // Fully connected
  tflite::FullyConnectedParams fc_params;
  // No fused activation clamp: pass the full float range.
  fc_params.float_activation_min = std::numeric_limits<float>::lowest();
  fc_params.float_activation_max = std::numeric_limits<float>::max();
  FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
                 weights_data, bias_shape, bias_data, activ_temp_shape,
                 activ_temp_data);

  // Memory state update (the LSTM "guts")
  for (int b = 0; b < batches; ++b) {
    for (int w = 0; w < width; ++w) {
      for (int h = 0; h < height; ++h) {
        for (int c = 0; c < output_depth; ++c) {
          // Logistic (sigmoid) of the input-gate pre-activation
          // (gate slice 0 of activ_temp).
          const float input_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
                                                      0 * output_depth + c)]));
          // tanh of the new-input (cell candidate) pre-activation (slice 1).
          const float new_input = std::tanh(activ_temp_data[Offset(
              activ_temp_shape, b, h, w, 1 * output_depth + c)]);
          // Sigmoid of the forget-gate pre-activation (slice 2).
          const float forget_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
                                                      2 * output_depth + c)]));
          // Sigmoid of the output-gate pre-activation (slice 3).
          const float output_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
                                                      3 * output_depth + c)]));
          // state' = input_gate * candidate + forget_gate * state.
          const float new_state =
              input_gate * new_input +
              forget_gate *
                  prev_state_data[Offset(prev_state_shape, b, h, w, c)];
          output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
          // activ' = output_gate * tanh(state').
          output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
              output_gate * std::tanh(new_state);
        }
      }
    }
  }
}
| |
// TODO(b/80418076): Move to legacy ops file, update invocations.
// Legacy.
// Dims<4>-based entry point: converts each Dims<4> to a RuntimeShape and
// forwards to the LstmCellParams-based float implementation.
inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                     const float* prev_activ_data,
                     const Dims<4>& prev_activ_dims, const float* weights_data,
                     const Dims<4>& weights_dims, const float* bias_data,
                     const Dims<4>& bias_dims, const float* prev_state_data,
                     const Dims<4>& prev_state_dims, float* output_state_data,
                     const Dims<4>& output_state_dims, float* output_activ_data,
                     const Dims<4>& output_activ_dims, float* concat_temp_data,
                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
                     const Dims<4>& activ_temp_dims) {
  tflite::LstmCellParams op_params;
  // Float LSTM cell does not need parameters to be set: leave untouched.

  LstmCell(op_params, DimsToShape(input_dims), input_data,
           DimsToShape(prev_activ_dims), prev_activ_data,
           DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims),
           bias_data, DimsToShape(prev_state_dims), prev_state_data,
           DimsToShape(output_state_dims), output_state_data,
           DimsToShape(output_activ_dims), output_activ_data,
           DimsToShape(concat_temp_dims), concat_temp_data,
           DimsToShape(activ_temp_dims), activ_temp_data);
}
| |
| // Quantized LSTM cell implementation. |
| // The quantization of the input, output arrays is as follows: |
| // - The input activations are quantized as uint8 on the interval |
| // [-1, 127/128]. |
//   The rationale for that is that this is the natural interval for output
| // activations (see next point) and these need to be concatenated together. |
| // We could accommodate different ranges by re-scaling, but we empirically |
| // found that setting the input activations range to be [-1, 127/128] in the |
| // first place, removing the need for re-scaling, greatly improves accuracy. |
| // - The output activations are quantized as uint8 on the interval |
| // [-1, 127/128]. |
| // The rationale for that is that the definition of a LSTM cell makes them |
| // intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128] |
| // makes for simpler, more accurate fixed-point arithmetic. |
| // - The output-at-previous-timestep state array is obviously quantized as |
| // the output activations. |
| // - The internal LSTM memory (not the output-at-previous-timestep, the other |
| // internal state array) is int16-quantized and may use any power-of-two, |
| // symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call |
| // StateIntegerBits below, see the below discussion of that template |
| // parameter ("The StateIntegerBits template parameter"). |
| // - The output of the internal fully-connected node is int16-quantized |
| // on the interval [-8, 8 * 32767/32768], the rationale for which is |
| // explained just below ("Why [-8, 8] for fully-connected output?"). |
| // |
| // |
| // === The StateIntegerBits template parameter === |
| // |
| // The StateIntegerBits template parameter controls the fixed-point format used |
| // to represent the internal memory of the LSTM cell (not the |
| // output-at-previous-timestep, the other internal state array). It's currently |
| // a template parameter so that the model can control that. The most typical |
| // value for StateIntegerBits is 4. Other plausible values are anywhere between |
| // 3 and 5. We might eventually standardize on a single supported value, e.g. 4, |
| // and drop that template parameter. The reason why it can't be a runtime |
| // parameter is that this controls the fixed-point format used, i.e. we need to |
| // generate actually different code based on it. In particular, we generate code |
| // for a fixed-point tanh() implementation for that format, which internally |
| // uses a fixed-point exp() implementation, which internally uses a |
| // barrel-shifter with a number of steps that depends on StateIntegerBits. |
| // Another consequence of that is that a higher value of StateIntegerBits |
| // results in a more expensive implementation (more barrel shifter steps |
| // needed). |
| // |
| // |
| // === Why [-8, 8] for fully-connected output? === |
| // |
| // This array is only fed to Logistic and Tanh functions, for which |
| // the quantized implementation will want to use fixed-point arithmetic, |
| // requiring a power-of-two representation interval. Thus, we should right |
| // away quantize this array to a power-of-two interval; otherwise, |
| // implementation will need to rescale that, losing any benefit that a tighter |
| // representation interval might otherwise yield, while introducing some |
| // numerical error and computational overhead. |
| // |
| // Now, Logistic and Tanh |
| // are nearly constant (nearly equal to their horizontal asymptotes) |
| // outside of a small bounded interval around 0: |
| // |
| // Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4 |
| // Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7 |
| // Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14 |
| // |
| // From this, we see that clamping to [-4, 4] would be too inaccurate |
| // (the error of 1.8e-2 on Logistic would be felt even in 8bit precision) |
| // while clamping to [-16, 16] would make no difference even in float32. |
| // However, for a fixed-point implementation in 16-bit integers, using 5 |
| // integer bits to represent the [-16, 16] range would leave only 11 |
| // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive |
// representable values. Notice that this is higher than the
| // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. |
| // Using [-8, 8] thus seems like the better compromise overall, enjoying |
| // an increment of 2.4e-4 between representable values and a worst-case |
| // clamping error of 3.4e-4, both better than the increment of 4.9e-4 with |
| // [-16, 16]. |
| // |
| // Moreover, all other things being equal, it is nice to choose the narrower |
| // representation range, as that makes the implementation of fixed-point |
| // math functions a little cheaper (each integer bit requires an additional |
// barrel-shifter step in the implementation of exp(-x)). That is further
| // reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make |
| // sense for 32-bit float or 32-bit fixed-point quantization, but we are |
| // aiming for 16-bit fixed-point quantization of these internal nodes here. |
| // |
// Quantized LSTM cell; see the extensive comment above for the quantization
// scheme of each array and the meaning of the StateIntegerBits template
// parameter.
template <int StateIntegerBits>
inline void LstmCell(
    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
    const uint8* input_data_uint8,
    const RuntimeShape& unextended_prev_activ_shape,
    const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
    const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
    const int32* bias_data_int32,
    const RuntimeShape& unextended_prev_state_shape,
    const int16* prev_state_data_int16,
    const RuntimeShape& unextended_output_state_shape,
    int16* output_state_data_int16,
    const RuntimeShape& unextended_output_activ_shape,
    uint8* output_activ_data_uint8,
    const RuntimeShape& unextended_concat_temp_shape,
    uint8* concat_temp_data_uint8,
    const RuntimeShape& unextended_activ_temp_shape,
    int16* activ_temp_data_int16, gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  int32 weights_zero_point = params.weights_zero_point;
  int32 accum_multiplier = params.accum_multiplier;
  int accum_shift = params.accum_shift;
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
  // Pad all shapes out to rank 4.
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape prev_activ_shape =
      RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
  const RuntimeShape bias_shape =
      RuntimeShape::ExtendedShape(4, unextended_bias_shape);
  const RuntimeShape prev_state_shape =
      RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
  const RuntimeShape output_state_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
  const RuntimeShape output_activ_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
  const RuntimeShape concat_temp_shape =
      RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
  const RuntimeShape activ_temp_shape =
      RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);

  // Gather dimensions information, and perform consistency checks.
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int outer_size = MatchingFlatSizeSkipDim(
      input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
      output_activ_shape);
  const int input_depth = input_shape.Dims(3);
  const int prev_activ_depth = prev_activ_shape.Dims(3);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
                   total_input_depth);
  const int intern_activ_depth =
      MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
  TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
                   intern_activ_depth * total_input_depth);
  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
  // 4 gate slices per output channel.
  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
                  3, output_activ_shape, 3);
  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
  const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
  const int fc_output_depth =
      MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
  const int fc_accum_depth = total_input_depth;
  TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);

  // Depth-concatenate prev_activ and input data together.
  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
                                              prev_activ_data_uint8};
  const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
                                                       &prev_activ_shape};
  tflite::ConcatenationParams concat_params;
  concat_params.axis = 3;
  concat_params.inputs_count = 2;
  Concatenation(concat_params, concat_input_arrays_shapes,
                concat_input_arrays_data, concat_temp_shape,
                concat_temp_data_uint8);

  // Implementation of the fully connected node inside the LSTM cell.
  // The operands are 8-bit integers, the accumulators are internally 32bit
  // integers, and the output is 16-bit fixed-point with 3 integer bits so
  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
  // is explained in the function comment above.
  for (int b = 0; b < fc_batches; ++b) {
    for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum = bias_data_int32[out_c];
      // Accumulation loop. uint8 operands are offset into signed range:
      // inputs by the fixed zero point 128, weights by weights_zero_point.
      for (int d = 0; d < fc_accum_depth; ++d) {
        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
        int16 weights_val =
            weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
        accum += input_val * weights_val;
      }
      // Down-scale the final int32 accumulator to the scale used by our
      // (16-bit, using 3 integer bits) fixed-point format. The quantized
      // multiplier and shift here have been pre-computed offline
      // (e.g. by toco).
      accum =
          MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
      // Saturate, cast to int16, and store to the temporary activations array.
      accum = std::max(-32768, std::min(32767, accum));
      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
    }
  }

  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
  // and muls, all done in 16-bit fixed-point.
  for (int b = 0; b < outer_size; ++b) {
    for (int c = 0; c < output_depth; ++c) {
      // Define the fixed-point data types that we will use here. All use
      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
      // They only differ by the number of integral vs. fractional bits,
      // determining the range of values that they can represent.
      //
      // F0 uses 0 integer bits, range [-1, 1].
      // This is the return type of math functions such as tanh, logistic,
      // whose range is in [-1, 1].
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      // F3 uses 3 integer bits, range [-8, 8].
      // This is the range of the previous fully-connected node's output,
      // which is our input here.
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
      // 2^StateIntegerBits]. It's used to represent the internal state, whose
      // number of integer bits is currently dictated by the model. See comment
      // on the StateIntegerBits template parameter above.
      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
      // Implementation of input gate, using fixed-point logistic function.
      F3 input_gate_input = F3::FromRaw(
          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
      // Implementation of input modulation gate, using fixed-point tanh
      // function.
      F3 input_modulation_gate_input = F3::FromRaw(
          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
      F0 input_modulation_gate_output =
          gemmlowp::tanh(input_modulation_gate_input);
      // Implementation of forget gate, using fixed-point logistic function.
      F3 forget_gate_input = F3::FromRaw(
          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
      // Implementation of output gate, using fixed-point logistic function.
      F3 output_gate_input = F3::FromRaw(
          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
      // Implementation of internal multiplication nodes, still in fixed-point.
      F0 input_times_input_modulation =
          input_gate_output * input_modulation_gate_output;
      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
      FS prev_state_times_forget_state = forget_gate_output * prev_state;
      // Implementation of internal addition node, saturating.
      FS new_state = gemmlowp::SaturatingAdd(
          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
          prev_state_times_forget_state);
      // Implementation of last internal Tanh node, still in fixed-point.
      // Since a Tanh fixed-point implementation is specialized for a given
      // number or integer bits, and each specialization can have a substantial
      // code size, and we already used above a Tanh on an input with 3 integer
      // bits, and per the table in the above function comment there is no
      // significant accuracy to be lost by clamping to [-8, +8] for a
      // 3-integer-bits representation, let us just do that. This helps people
      // porting this to targets where code footprint must be minimized.
      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
      // Store the new internal state back to memory, as 16-bit integers.
      // Note: here we store the original value with StateIntegerBits, not
      // the rescaled 3-integer-bits value fed to tanh.
      output_state_data_int16[b * output_depth + c] = new_state.raw();
      // Down-scale the output activations to 8-bit integers, saturating,
      // and store back to memory. The rounding divide by 2^8 maps the raw
      // int16 [-1, 1] fixed-point value into the int8 range before clamping.
      int16 rescaled_output_activ =
          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
      int16 clamped_output_activ =
          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
      // Re-offset into uint8 with a zero point of 128.
      output_activ_data_uint8[b * output_depth + c] =
          128 + clamped_output_activ;
    }
  }
}
| |
// TODO(b/80418076): Move to legacy ops file, update invocations.
// Legacy.
// Dims<4>-based entry point for the quantized LSTM cell: packs the loose
// quantization parameters into LstmCellParams, converts the Dims<4>
// descriptors to RuntimeShapes, and forwards to the implementation above.
template <int StateIntegerBits>
void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
              const uint8* prev_activ_data_uint8,
              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
              const Dims<4>& weights_dims, const int32* bias_data_int32,
              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
              int32 accum_multiplier, int accum_shift,
              gemmlowp::GemmContext* gemm_context) {
  tflite::LstmCellParams op_params;
  op_params.weights_zero_point = weights_zero_point;
  op_params.accum_multiplier = accum_multiplier;
  op_params.accum_shift = accum_shift;

  LstmCell<StateIntegerBits>(
      op_params, DimsToShape(input_dims), input_data_uint8,
      DimsToShape(prev_activ_dims), prev_activ_data_uint8,
      DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims),
      bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16,
      DimsToShape(output_state_dims), output_state_data_int16,
      DimsToShape(output_activ_dims), output_activ_data_uint8,
      DimsToShape(concat_temp_dims), concat_temp_data_uint8,
      DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context);
}
| |
| template <typename Scalar> |
| void Split(const SplitParams& params, const RuntimeShape& input_shape, |
| const Scalar* input_data, const RuntimeShape* const* output_shapes, |
| Scalar* const* output_data) { |
| const int concat_dimensions = input_shape.DimensionsCount(); |
| int axis = params.axis < 0 ? params.axis + concat_dimensions : params.axis; |
| int outputs_count = params.num_split; |
| TFLITE_DCHECK_LT(axis, concat_dimensions); |
| |
| int64_t concat_size = 0; |
| for (int i = 0; i < outputs_count; i++) { |
| TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), concat_dimensions); |
| for (int j = 0; j < concat_dimensions; j++) { |
| if (j != axis) { |
| MatchingDim(*output_shapes[i], j, input_shape, j); |
| } |
| } |
| concat_size += output_shapes[i]->Dims(axis); |
| } |
| TFLITE_DCHECK_EQ(concat_size, input_shape.Dims(axis)); |
| int64_t outer_size = 1; |
| for (int i = 0; i < axis; ++i) { |
| outer_size *= input_shape.Dims(i); |
| } |
| // For all output arrays, |
| // FlatSize() = outer_size * Dims(axis) * base_inner_size; |
| int64_t base_inner_size = 1; |
| for (int i = axis + 1; i < concat_dimensions; ++i) { |
| base_inner_size *= input_shape.Dims(i); |
| } |
| |
| const Scalar* input_ptr = input_data; |
| for (int k = 0; k < outer_size; k++) { |
| for (int i = 0; i < outputs_count; ++i) { |
| const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size; |
| memcpy(output_data[i] + k * copy_size, input_ptr, |
| copy_size * sizeof(Scalar)); |
| input_ptr += copy_size; |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| template <typename Scalar> |
| void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, |
| int axis, int outputs_count, Scalar* const* output_data, |
| const Dims<4>* const* output_dims) { |
| std::vector<RuntimeShape> output_shapes(outputs_count); |
| std::vector<const RuntimeShape*> output_shapes_indirect(outputs_count); |
| for (int i = 0; i < outputs_count; ++i) { |
| ShapeFromDims(*output_dims[i], &output_shapes[i]); |
| output_shapes_indirect[i] = &output_shapes[i]; |
| } |
| tflite::SplitParams op_params; |
| op_params.axis = 3 - axis; |
| op_params.num_split = outputs_count; |
| |
| Split(op_params, DimsToShape(input_dims), input_data, |
| output_shapes_indirect.data(), output_data); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| template <FusedActivationFunctionType Ac, typename Scalar> |
| void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, |
| int outputs_count, Scalar* const* output_data, |
| const Dims<4>* const* output_dims) { |
| TFLITE_DCHECK_GE(outputs_count, 1); |
| for (int i = 0; i < outputs_count; i++) { |
| /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); |
| /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); |
| /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); |
| } |
| // For now we don't have a model with a Split with fused activation. |
| TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone); |
| |
| TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count, |
| output_data, output_dims); |
| } |
| |
// Flattens (batch, y, x) coordinates into a linear node index for a
// [batches, height, width] layout: ((b * height) + h) * width + w.
inline int NodeOffset(int b, int h, int w, int height, int width) {
  const int row = b * height + h;
  return row * width + w;
}
| |
| inline void AveragePool(const PoolParams& params, |
| const RuntimeShape& input_shape, |
| const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); |
| const int batches = MatchingDim(input_shape, 0, output_shape, 0); |
| const int depth = MatchingDim(input_shape, 3, output_shape, 3); |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| const int stride_height = params.stride_height; |
| const int stride_width = params.stride_width; |
| for (int batch = 0; batch < batches; ++batch) { |
| for (int out_y = 0; out_y < output_height; ++out_y) { |
| for (int out_x = 0; out_x < output_width; ++out_x) { |
| for (int channel = 0; channel < depth; ++channel) { |
| const int in_x_origin = |
| (out_x * stride_width) - params.padding_values.width; |
| const int in_y_origin = |
| (out_y * stride_height) - params.padding_values.height; |
| // Compute the boundaries of the filter region clamped so as to |
| // ensure that the filter window fits in the input array. |
| const int filter_x_start = std::max(0, -in_x_origin); |
| const int filter_x_end = |
| std::min(params.filter_width, input_width - in_x_origin); |
| const int filter_y_start = std::max(0, -in_y_origin); |
| const int filter_y_end = |
| std::min(params.filter_height, input_height - in_y_origin); |
| float total = 0.f; |
| float filter_count = 0; |
| for (int filter_y = filter_y_start; filter_y < filter_y_end; |
| ++filter_y) { |
| for (int filter_x = filter_x_start; filter_x < filter_x_end; |
| ++filter_x) { |
| const int in_x = in_x_origin + filter_x; |
| const int in_y = in_y_origin + filter_y; |
| total += |
| input_data[Offset(input_shape, batch, in_y, in_x, channel)]; |
| filter_count++; |
| } |
| } |
| const float average = total / filter_count; |
| output_data[Offset(output_shape, batch, out_y, out_x, channel)] = |
| ActivationFunctionWithMinMax(average, params.float_activation_min, |
| params.float_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
// Quantized (uint8) average pooling. For each output element, averages the
// input values inside the (border-clamped) filter window using
// round-to-nearest integer division, then clamps to
// [quantized_activation_min, quantized_activation_max].
inline void AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const uint8* input_data,
                        const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  // Shapes are NHWC; batch and depth must match between input and output.
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          // Top-left corner of the filter window in input coordinates; may be
          // negative when the window overhangs the padded border.
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
          // 32-bit accumulator is ample: at most 255 * filter area.
          int32 acc = 0;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              acc +=
                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
              filter_count++;
            }
          }
          // Rounded average: adding filter_count / 2 before dividing rounds
          // to nearest instead of truncating toward zero.
          // NOTE(review): filter_count is zero if the clamped window is empty
          // (fully outside the input), which would divide by zero — presumed
          // unreachable for valid padding parameters.
          acc = (acc + filter_count / 2) / filter_count;
          acc = std::max(acc, params.quantized_activation_min);
          acc = std::min(acc, params.quantized_activation_max);
          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}
| |
// L2 pooling: for each output element, computes sqrt(mean(x^2)) over the
// (border-clamped) filter window, then applies the float activation clamp.
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
                   const float* input_data, const RuntimeShape& output_shape,
                   float* output_data) {
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  // Shapes are NHWC; batch and depth must match between input and output.
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          // Top-left corner of the filter window in input coordinates; may be
          // negative when the window overhangs the padded border.
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
          float sum_squares = 0.f;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              const float val =
                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
              sum_squares += val * val;
              filter_count++;
            }
          }
          // NOTE(review): filter_count is zero if the clamped window is empty
          // (fully outside the input), which would divide by zero — presumed
          // unreachable for valid padding parameters.
          const float l2pool_result = std::sqrt(sum_squares / filter_count);
          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
              ActivationFunctionWithMinMax(l2pool_result,
                                           params.float_activation_min,
                                           params.float_activation_max);
        }
      }
    }
  }
}
| |
| inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& output_shape, |
| float* output_data) { |
| TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); |
| const int batches = MatchingDim(input_shape, 0, output_shape, 0); |
| const int depth = MatchingDim(input_shape, 3, output_shape, 3); |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| const int stride_height = params.stride_height; |
| const int stride_width = params.stride_width; |
| for (int batch = 0; batch < batches; ++batch) { |
| for (int out_y = 0; out_y < output_height; ++out_y) { |
| for (int out_x = 0; out_x < output_width; ++out_x) { |
| for (int channel = 0; channel < depth; ++channel) { |
| const int in_x_origin = |
| (out_x * stride_width) - params.padding_values.width; |
| const int in_y_origin = |
| (out_y * stride_height) - params.padding_values.height; |
| // Compute the boundaries of the filter region clamped so as to |
| // ensure that the filter window fits in the input array. |
| const int filter_x_start = std::max(0, -in_x_origin); |
| const int filter_x_end = |
| std::min(params.filter_width, input_width - in_x_origin); |
| const int filter_y_start = std::max(0, -in_y_origin); |
| const int filter_y_end = |
| std::min(params.filter_height, input_height - in_y_origin); |
| float max = std::numeric_limits<float>::lowest(); |
| for (int filter_y = filter_y_start; filter_y < filter_y_end; |
| ++filter_y) { |
| for (int filter_x = filter_x_start; filter_x < filter_x_end; |
| ++filter_x) { |
| const int in_x = in_x_origin + filter_x; |
| const int in_y = in_y_origin + filter_y; |
| max = std::max( |
| max, |
| input_data[Offset(input_shape, batch, in_y, in_x, channel)]); |
| } |
| } |
| output_data[Offset(output_shape, batch, out_y, out_x, channel)] = |
| ActivationFunctionWithMinMax(max, params.float_activation_min, |
| params.float_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
// Quantized (uint8) max pooling. For each output element, takes the maximum
// input value inside the (border-clamped) filter window, then clamps to
// [quantized_activation_min, quantized_activation_max].
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                    const uint8* input_data, const RuntimeShape& output_shape,
                    uint8* output_data) {
  // Activation bounds must form a valid sub-range of the uint8 domain.
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
  TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  // Shapes are NHWC; batch and depth must match between input and output.
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          // Top-left corner of the filter window in input coordinates; may be
          // negative when the window overhangs the padded border.
          const int in_x_origin =
              (out_x * stride_width) - params.padding_values.width;
          const int in_y_origin =
              (out_y * stride_height) - params.padding_values.height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(params.filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(params.filter_height, input_height - in_y_origin);
          // 0 is the smallest uint8, so it is a safe identity for max.
          uint8 max = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
            }
          }
          max = std::max<uint8>(max, params.quantized_activation_min);
          max = std::min<uint8>(max, params.quantized_activation_max);
          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
              static_cast<uint8>(max);
        }
      }
    }
  }
}
| |
| inline void LocalResponseNormalization( |
| const tflite::LocalResponseNormalizationParams& op_params, |
| const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int trailing_dim = input_shape.DimensionsCount() - 1; |
| const int outer_size = |
| MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); |
| const int depth = |
| MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); |
| |
| for (int i = 0; i < outer_size; ++i) { |
| for (int c = 0; c < depth; ++c) { |
| const int begin_input_c = std::max(0, c - op_params.range); |
| const int end_input_c = std::min(depth, c + op_params.range); |
| float accum = 0.f; |
| for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { |
| const float input_val = input_data[i * depth + input_c]; |
| accum += input_val * input_val; |
| } |
| const float multiplier = |
| std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta); |
| output_data[i * depth + c] = input_data[i * depth + c] * multiplier; |
| } |
| } |
| } |
| |
| inline void Softmax(const SoftmaxParams& params, |
| const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int trailing_dim = input_shape.DimensionsCount() - 1; |
| const int outer_size = |
| MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); |
| const int depth = |
| MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); |
| |
| for (int i = 0; i < outer_size; ++i) { |
| // Find max element value which we'll use to ensure numerical stability |
| // taking advantage of the following equality: |
| // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) |
| float max = std::numeric_limits<float>::lowest(); |
| for (int c = 0; c < depth; ++c) { |
| max = std::max(max, input_data[i * depth + c]); |
| } |
| |
| // Compute sum. |
| float sum = 0.f; |
| for (int c = 0; c < depth; ++c) { |
| sum += std::exp((input_data[i * depth + c] - max) * params.beta); |
| } |
| |
| // Compute result. |
| for (int c = 0; c < depth; ++c) { |
| output_data[i * depth + c] = |
| std::exp((input_data[i * depth + c] - max) * params.beta) / sum; |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Softmax(const float* input_data, const RuntimeShape& input_shape, |
| float beta, float* output_data, |
| const RuntimeShape& output_shape) { |
| SoftmaxParams params; |
| params.beta = beta; |
| Softmax(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
// Quantized (uint8) softmax over the trailing dimension, computed entirely in
// 32-bit fixed point. params carries input_multiplier / input_left_shift (the
// quantized representation of beta times the input scale) and diff_min, the
// (negative) threshold below which exp(diff) is treated as zero.
inline void Softmax(const SoftmaxParams& params,
                    const RuntimeShape& input_shape, const uint8* input_data,
                    const RuntimeShape& output_shape, uint8* output_data) {
  const int32 input_beta_multiplier = params.input_multiplier;
  const int32 input_beta_left_shift = params.input_left_shift;
  const int diff_min = params.diff_min;
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large as
  // -32 before multiplying by input_beta_multiplier, and therefore as large as
  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
  // accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i) {
    // Row max; diffs (input - max) are then <= 0, the domain that
    // exp_on_negative_values requires.
    uint8 max_in_row = 0;
    for (int c = 0; c < depth; ++c) {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }

    // First pass: accumulate sum(exp(input - max)) in fixed point. Diffs
    // below diff_min contribute negligibly and are skipped.
    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
    for (int c = 0; c < depth; ++c) {
      int32 input_diff =
          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min) {
        const int32 input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                        exp_on_negative_values(scaled_diff_f8));
      }
    }

    // Normalize the sum into [1, 2) by shifting out its headroom so its
    // reciprocal can be computed with one_over_one_plus_x_for_x_in_0_1.
    int32 fixed_sum_of_exps = sum_of_exps.raw();
    int headroom_plus_one =
        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
    // This is the number of bits to the left of the binary point above 1.0.
    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
    // no later adjustment will be needed.
    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
    int32 shifted_sum_minus_one = static_cast<int32>(
        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
        (static_cast<uint32>(1) << 31));

    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
        FixedPoint0::FromRaw(shifted_sum_minus_one));

    // Second pass: output = exp(diff) * (1 / sum), rescaled and saturated to
    // the uint8 range.
    for (int c = 0; c < depth; ++c) {
      int32 input_diff =
          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min) {
        const int32 input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
            FixedPointScaledDiff::FromRaw(input_diff_rescaled);

        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
        // Undo the earlier headroom normalization and rescale to 8 bits.
        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

        output_data[i * depth + c] = static_cast<uint8>(
            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));

      } else {
        // Skipped terms round to zero probability.
        output_data[i * depth + c] = 0;
      }
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy |
| inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape, |
| int32 input_beta_multiplier, int32 input_beta_left_shift, |
| int diff_min, uint8* output_data, |
| const RuntimeShape& output_shape) { |
| SoftmaxParams params; |
| params.input_multiplier = input_beta_multiplier; |
| params.input_left_shift = input_beta_left_shift; |
| params.diff_min = diff_min; |
| Softmax(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void LogSoftmax(const SoftmaxParams& params, |
| const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int trailing_dim = input_shape.DimensionsCount() - 1; |
| const int outer_size = |
| MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); |
| const int depth = |
| MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); |
| |
| for (int i = 0; i < outer_size; ++i) { |
| // Find max element value which we'll use to ensure numerical stability |
| // taking advantage of the following equality: |
| // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C))) |
| float max = std::numeric_limits<float>::lowest(); |
| for (int c = 0; c < depth; ++c) { |
| max = std::max(max, input_data[i * depth + c]); |
| } |
| |
| // Compute sum. |
| float sum = 0.f; |
| for (int c = 0; c < depth; ++c) { |
| sum += std::exp(input_data[i * depth + c] - max); |
| } |
| |
| // Compute result. |
| const float log_sum = std::log(sum); |
| for (int c = 0; c < depth; ++c) { |
| output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum; |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy |
| inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape, |
| float* output_data, const RuntimeShape& output_shape) { |
| SoftmaxParams params; |
| // No params currently used for float LogSoftmax. |
| LogSoftmax(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
// Although currently the name of this function says that it cannot handle
// values less than 1, in practice it can handle as low as 1/x_max, where
// x_max is the largest representable input. In other words, the output range
// is symmetric.
//
// Fixed-point natural logarithm. Decomposes the input as x = 2^k * r with r
// near 1, returning k*log(2) plus a rational approximation of log(r).
template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1_impl(
    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
  // The reason for accumulating the result with an extra bit of headroom is
  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
  // recip_denom will otherwise introduce an error.
  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;

  // Q0.31 constants; each raw value is checked against its double-precision
  // expression at compile time by GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT.
  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 1488522236, std::log(2.0));
  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 1518500250, std::sqrt(0.5));
  const FixedPoint0 one_quarter =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);

  // Coefficients of the rational approximation of log around 1.
  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 1057819769,
      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));

  const FixedPointAccum shifted_quarter =
      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);

  // Reinterpret the input value as Q0.31, because we will figure out the
  // required shift "ourselves" instead of using, say, Rescale.
  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
  // z_a_pow_2 = input_integer_bits - z_a_headroom;
  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
  FixedPoint0 r_a_tmp =
      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
  const int32 r_a_raw =
      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
  //                   InputIntegerBits - z_b_headroom - 0.25);
  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
      shifted_quarter);

  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
  FixedPoint0 z_b = z_a * sqrt_half;
  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
  const int32 r_b_raw =
      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
      shifted_quarter);

  // Pick whichever of the two candidate normalizations (a or b) yields the
  // smaller mantissa r / larger exponent adjustment.
  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));

  // Evaluate the rational approximation of log(r) as num / (1 + denom).
  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
  FixedPoint0 q = r - sqrt_sqrt_half;
  q = q + q;

  const FixedPoint0 common_sq = q * q;
  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
  const FixedPoint0 denom_minus_one_0 =
      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
  const FixedPoint0 recip_denom =
      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);

  // Combine the exponent contribution (z_pow_2_adj * log(2)) with the
  // mantissa contribution, then rescale to the requested output format.
  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
                                              num_scaled * recip_denom);
}
| |
// Minimum output bits to accommodate log of maximum input range. It actually
// does not matter if one considers, say, [-64,64] or [-64,64).
//
// For example, run this through Octave:
// [0:127; ...
//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
// Single-expression ternary chain (ascending thresholds) keeps this valid as
// a C++11 constexpr function.
constexpr int min_log_x_output_bits(int input_bits) {
  return input_bits <= 1
             ? 1
             : input_bits <= 4
                   ? 2
                   : input_bits <= 10
                         ? 3
                         : input_bits <= 21
                               ? 4
                               : input_bits <= 44
                                     ? 5
                                     : input_bits <= 90 ? 6 : 7;
}
| |
| template <int OutputIntegerBits, int InputIntegerBits> |
| inline gemmlowp::FixedPoint<int32, OutputIntegerBits> |
| log_x_for_x_greater_than_or_equal_to_1( |
| gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { |
| static_assert( |
| OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits), |
| "Output integer bits must be sufficent to accommodate logs of inputs."); |
| return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits, |
| InputIntegerBits>( |
| input_val); |
| } |
| |
// Quantized (uint8) log-softmax over the trailing dimension, computed in
// 32-bit fixed point: out[c] ~ (in[c] - max) - log(sum(exp(in - max))),
// rescaled so that 255 represents log-probability 0 and 0 is the floor.
inline void LogSoftmax(const SoftmaxParams& params,
                       const RuntimeShape& input_shape, const uint8* input_data,
                       const RuntimeShape& output_shape, uint8* output_data) {
  const int32 input_multiplier = params.input_multiplier;
  const int32 input_left_shift = params.input_left_shift;
  const int32 reverse_scaling_divisor = params.reverse_scaling_divisor;
  const int32 reverse_scaling_right_shift = params.reverse_scaling_right_shift;
  const int diff_min = params.diff_min;
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large
  // as -32 before multiplying by input_beta_multiplier, and therefore as
  // large as -16 afterwards. Note that exp(-8) is definitely not
  // insignificant to accumulation, but exp(-16) definitely is.
  static constexpr int kScaledDiffIntegerBits = 5;
  static constexpr int kAccumulationIntegerBits = 12;
  static constexpr int kOutputIntegerBits = 4;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i) {
    // Row max; diffs (input - max) are then <= 0, the domain that
    // exp_on_negative_values requires.
    uint8 max_in_row = 0;
    for (int c = 0; c < depth; ++c) {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }

    // First pass: accumulate sum(exp(input - max)) in fixed point. Diffs
    // below diff_min contribute negligibly and are skipped.
    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
    for (int c = 0; c < depth; ++c) {
      int32 input_diff =
          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min) {
        const int32 input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_multiplier, input_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                        exp_on_negative_values(scaled_diff_f8));
      }
    }

    // log of the accumulated denominator, in the scaled-diff format.
    const int32 fixed_log_sum_of_exps =
        log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
            sum_of_exps)
            .raw();

    // rescaled_diff_min is smallest representable in
    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
    // log-sub-exps that will be subtracted in the loop.
    //
    // The thresholds diff_min, etc are negative.
    const int rescaled_diff_min =
        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
    const int adjusted_diff_min =
        std::max(diff_min - 1,  // Note use of > below instead of >= above.
                 MultiplyByQuantizedMultiplierSmallerThanOneExp(
                     rescaled_diff_min, reverse_scaling_divisor,
                     -reverse_scaling_right_shift));

    // Second pass: output = diff - log(sum), rescaled to uint8 where 255
    // corresponds to log-probability 0.
    for (int c = 0; c < depth; ++c) {
      int32 input_diff =
          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
      if (input_diff > adjusted_diff_min) {
        const int32 input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_multiplier, input_left_shift);
        int32 unsat_output =
            gemmlowp::RoundingDivideByPOT(
                (input_diff_rescaled - fixed_log_sum_of_exps),
                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
            255;

        output_data[i * depth + c] = static_cast<uint8>(
            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
      } else {
        // Set output to smallest value.
        output_data[i * depth + c] = 0;
      }
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape, |
| int32 input_multiplier, int32 input_left_shift, |
| int32 reverse_scaling_divisor, |
| int32 reverse_scaling_right_shift, int diff_min, |
| uint8* output_data, const RuntimeShape& output_shape) { |
| SoftmaxParams params; |
| params.input_multiplier = input_multiplier; |
| params.input_left_shift = input_left_shift; |
| params.reverse_scaling_divisor = reverse_scaling_divisor; |
| params.reverse_scaling_right_shift = reverse_scaling_right_shift; |
| params.diff_min = diff_min; |
| LogSoftmax(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void Logistic(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| float val = input_data[i]; |
| float result = 1.f / (1.f + std::exp(-val)); |
| output_data[i] = result; |
| } |
| } |
| |
| // Convenience version that allows, for example, generated-code calls to be |
| // uniform between data types. |
| inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& output_shape, |
| float* output_data) { |
| // Drop params: not needed. |
| Logistic(input_shape, input_data, output_shape, output_data); |
| } |
| |
// Quantized (uint8) logistic/sigmoid.
// Centers the input on input_zero_point, rescales it into gemmlowp Q4.27
// fixed point, evaluates gemmlowp::logistic, and rounds the Q0.31 result
// down to 8 bits. Inputs beyond +/-input_range_radius saturate directly to
// the output endpoints 0 / 255.
inline void Logistic(const LogisticParams& params,
                     const RuntimeShape& input_shape, const uint8* input_data,
                     const RuntimeShape& output_shape, uint8* output_data) {
  const int32 input_zero_point = params.input_zero_point;
  const int32 input_range_radius = params.input_range_radius;
  const int32 input_multiplier = params.input_multiplier;
  const int input_left_shift = params.input_left_shift;
  const int flat_size = MatchingFlatSize(input_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    const uint8 input_val_u8 = input_data[i];
    const int32 input_val_centered =
        static_cast<int32>(input_val_u8) - input_zero_point;
    uint8 output_val;
    if (input_val_centered <= -input_range_radius) {
      // Far negative input: sigmoid is effectively 0.
      output_val = 0;
    } else if (input_val_centered >= input_range_radius) {
      // Far positive input: sigmoid is effectively 1.
      output_val = 255;
    } else {
      const int32 input_val_rescaled =
          MultiplyByQuantizedMultiplierGreaterThanOne(
              input_val_centered, input_multiplier, input_left_shift);
      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
      // Convert from Q0.31 to Q23.8.
      using gemmlowp::RoundingDivideByPOT;
      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
      if (output_val_s32 == 256) {
        // Rounding can overshoot by one step; clamp to the uint8 maximum.
        output_val_s32 = 255;
      }
      // Reinterpret as U0.8.
      TFLITE_DCHECK_GE(output_val_s32, 0);
      TFLITE_DCHECK_LE(output_val_s32, 255);
      output_val = static_cast<uint8>(output_val_s32);
    }
    output_data[i] = output_val;
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape, |
| int32 input_zero_point, int32 input_range_radius, |
| int32 input_multiplier, int input_left_shift, |
| uint8* output_data, const RuntimeShape& output_shape) { |
| LogisticParams params; |
| params.input_zero_point = input_zero_point; |
| params.input_range_radius = input_range_radius; |
| params.input_multiplier = input_multiplier; |
| params.input_left_shift = input_left_shift; |
| Logistic(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void Logistic(const LogisticParams& params, |
| const RuntimeShape& input_shape, const int16* input_data, |
| const RuntimeShape& output_shape, int16* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| // F0 uses 0 integer bits, range [-1, 1]. |
| // This is the return type of math functions such as tanh, logistic, |
| // whose range is in [-1, 1]. |
| using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; |
| // F3 uses 3 integer bits, range [-8, 8], the input range expected here. |
| using F3 = gemmlowp::FixedPoint<std::int16_t, 3>; |
| |
| const F3 input = F3::FromRaw(input_data[i]); |
| F0 output = gemmlowp::logistic(input); |
| output_data[i] = output.raw(); |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Logistic(const RuntimeShape& input_shape, const int16* input_data, |
| const RuntimeShape& output_shape, int16* output_data) { |
| LogisticParams params; |
| // No params currently needed by int16 Logistic. |
| Logistic(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void Tanh(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| float val = input_data[i]; |
| float result = std::tanh(val); |
| output_data[i] = result; |
| } |
| } |
| |
| // Convenience version that allows, for example, generated-code calls to be |
| // uniform between data types. |
| inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& output_shape, |
| float* output_data) { |
| // Drop params: not needed. |
| Tanh(input_shape, input_data, output_shape, output_data); |
| } |
| |
// Quantized (uint8) tanh.
// Centers the input on input_zero_point, rescales it into gemmlowp Q4.27
// fixed point, evaluates gemmlowp::tanh, then converts the Q0.31 result to
// 8 bits around a fixed output zero point of 128. Inputs beyond
// +/-input_range_radius saturate directly to the output endpoints 0 / 255.
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
                 const uint8* input_data, const RuntimeShape& output_shape,
                 uint8* output_data) {
  const int32 input_zero_point = params.input_zero_point;
  const int32 input_range_radius = params.input_range_radius;
  const int32 input_multiplier = params.input_multiplier;
  const int input_left_shift = params.input_left_shift;
  // tanh is symmetric around 0, so the uint8 zero point is fixed at 128.
  const int32 output_zero_point = 128;
  const int flat_size = MatchingFlatSize(input_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    const uint8 input_val_u8 = input_data[i];
    const int32 input_val_centered =
        static_cast<int32>(input_val_u8) - input_zero_point;
    uint8 output_val;
    if (input_val_centered <= -input_range_radius) {
      // Far negative input: tanh is effectively -1.
      output_val = 0;
    } else if (input_val_centered >= input_range_radius) {
      // Far positive input: tanh is effectively +1.
      output_val = 255;
    } else {
      const int32 input_val_rescaled =
          MultiplyByQuantizedMultiplierGreaterThanOne(
              input_val_centered, input_multiplier, input_left_shift);
      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
      // Convert from Q0.31 to Q24.7.
      using gemmlowp::RoundingDivideByPOT;
      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
      output_val_s32 += output_zero_point;
      if (output_val_s32 == 256) {
        // Rounding can overshoot by one step; clamp to the uint8 maximum.
        output_val_s32 = 255;
      }
      // Reinterpret as Q0.7, encoded in uint8.
      TFLITE_DCHECK_GE(output_val_s32, 0);
      TFLITE_DCHECK_LE(output_val_s32, 255);
      output_val = static_cast<uint8>(output_val_s32);
    }
    output_data[i] = output_val;
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape, |
| int32 input_zero_point, int32 input_range_radius, |
| int32 input_multiplier, int input_left_shift, |
| uint8* output_data, const RuntimeShape& output_shape) { |
| TanhParams params; |
| params.input_zero_point = input_zero_point; |
| params.input_range_radius = input_range_radius; |
| params.input_multiplier = input_multiplier; |
| params.input_left_shift = input_left_shift; |
| Tanh(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, |
| const int16* input_data, const RuntimeShape& output_shape, |
| int16* output_data) { |
| const int input_left_shift = params.input_left_shift; |
| // Support for shifts is limited until we have a parameterized version of |
| // SaturatingRoundingMultiplyByPOT(). |
| TFLITE_DCHECK_GE(input_left_shift, 0); |
| TFLITE_DCHECK_LE(input_left_shift, 1); |
| |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| // F0 uses 0 integer bits, range [-1, 1]. |
| // This is the return type of math functions such as tanh, logistic, |
| // whose range is in [-1, 1]. |
| using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; |
| // F3 uses 3 integer bits, range [-8, 8], the input range expected here. |
| using F3 = gemmlowp::FixedPoint<std::int16_t, 3>; |
| |
| if (input_left_shift == 0) { |
| for (int i = 0; i < flat_size; i++) { |
| F3 input = F3::FromRaw(input_data[i]); |
| F0 output = gemmlowp::tanh(input); |
| output_data[i] = output.raw(); |
| } |
| } else { |
| for (int i = 0; i < flat_size; i++) { |
| F3 input = F3::FromRaw( |
| gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i])); |
| F0 output = gemmlowp::tanh(input); |
| output_data[i] = output.raw(); |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void Tanh(const int16* input_data, const RuntimeShape& input_shape, |
| int input_left_shift, int16* output_data, |
| const RuntimeShape& output_shape) { |
| TanhParams params; |
| params.input_left_shift = input_left_shift; |
| Tanh(params, input_shape, input_data, output_shape, output_data); |
| } |
| |
| inline void Dequantize(const tflite::DequantizationParams& op_params, |
| const RuntimeShape& input_shape, const uint8* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| int32 zero_point = op_params.zero_point; |
| double scale = op_params.scale; |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| int32 val = input_data[i]; |
| float result = static_cast<float>(scale * (val - zero_point)); |
| output_data[i] = result; |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, |
| int32 zero_point, double scale, float* output_data, |
| const Dims<4>& output_dims) { |
| tflite::DequantizationParams op_params; |
| op_params.zero_point = zero_point; |
| op_params.scale = scale; |
| |
| Dequantize(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| inline void FakeQuant(const tflite::FakeQuantParams& op_params, |
| const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| float rmin = op_params.minmax.min; |
| float rmax = op_params.minmax.max; |
| int num_bits = op_params.num_bits; |
| // 0 should always be a representable value. Let's assume that the initial |
| // min,max range contains 0. |
| TFLITE_DCHECK_LE(rmin, 0.0f); |
| TFLITE_DCHECK_GE(rmax, 0.0f); |
| TFLITE_DCHECK_LT(rmin, rmax); |
| |
| // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor. |
| int quant_min = 0; |
| int quant_max = (1 << num_bits) - 1; |
| float nudged_min, nudged_max, nudged_scale; |
| NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, |
| &nudged_max, &nudged_scale); |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, |
| output_data, flat_size); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, |
| float rmin, float rmax, int num_bits, float* output_data, |
| const Dims<4>& output_dims) { |
| tflite::FakeQuantParams op_params; |
| op_params.num_bits = num_bits; |
| op_params.minmax.min = rmin; |
| op_params.minmax.max = rmax; |
| |
| FakeQuant(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename SrcT, typename DstT> |
| inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data, |
| const RuntimeShape& output_shape, DstT* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| int offset = i; |
| output_data[offset] = static_cast<DstT>(input_data[offset]); |
| } |
| } |
| |
| inline void Floor(const RuntimeShape& input_shape, const float* input_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int flat_size = MatchingFlatSize(input_shape, output_shape); |
| |
| for (int i = 0; i < flat_size; i++) { |
| int offset = i; |
| output_data[offset] = std::floor(input_data[offset]); |
| } |
| } |
| |
// Gathers slices of `input_data` along `axis`, where axis is derived from
// the output rank minus op_params.input_rank. Each entry of `coords_data`
// selects one slice of `stride` contiguous elements.
template <typename T>
inline void Gather(const tflite::GatherParams& op_params,
                   const RuntimeShape& input_shape, const T* input_data,
                   const RuntimeShape& coords_shape, const int32* coords_data,
                   const RuntimeShape& output_shape, T* output_data) {
  // Enable these checks when moving legacy ops to legacy_reference_ops.
  //
  // TFLITE_DCHECK_EQ(coords_shape.DimensionsCount(), 1);
  const int input_rank = op_params.input_rank;
  const int gather_dimensions = output_shape.DimensionsCount();
  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), gather_dimensions);
  // The gather axis is the leading output dim contributed by the coords.
  const int axis = gather_dimensions - input_rank;
  TFLITE_DCHECK_LT(axis, gather_dimensions);
  TFLITE_DCHECK_GE(axis, 0);
  const int coords_count = coords_shape.FlatSize();
  TFLITE_DCHECK_EQ(coords_count, output_shape.Dims(axis));

  // Number of contiguous elements in one slice past the gather axis.
  int64_t stride = 1;
  for (int i = axis + 1; i < gather_dimensions; ++i) {
    stride *= input_shape.Dims(i);
  }
  T* out = output_data;

  for (int i = 0; i < coords_count; ++i) {
    TFLITE_DCHECK_GE(coords_data[i], 0);
    TFLITE_DCHECK_LT(coords_data[i], input_shape.Dims(axis));
    const T* in = input_data + coords_data[i] * stride;
    // NOTE(review): memcpy assumes T is trivially copyable — true for the
    // numeric types used by these kernels.
    memcpy(out, in, sizeof(T) * stride);
    out += stride;
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4> version. |
| // When moving legacy ops to legacy_reference_ops, replace content with looser |
| // implementation. |
| template <typename T> |
| inline void Gather(const T* input_data, const Dims<4>& input_dims, |
| int input_rank, const int32* coords_data, |
| const Dims<4>& coords_dims, T* output_data, |
| const Dims<4>& output_dims) { |
| tflite::GatherParams op_params; |
| op_params.input_rank = input_rank; |
| |
| Gather(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(coords_dims), coords_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
// Resizes the spatial (height/width) dimensions of a 4D tensor using
// bilinear interpolation. output_size_data holds {new_height, new_width}.
// With align_corners, corner pixels of input and output are aligned by
// using (in_dim - 1) / (out_dim - 1) scales.
template <typename T>
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                           const RuntimeShape& unextended_input_shape,
                           const T* input_data,
                           const RuntimeShape& unextended_output_size_shape,
                           const int32* output_size_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_size_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
  int32 input_height = input_shape.Dims(1);
  int32 input_width = input_shape.Dims(2);
  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);

  // The size tensor must be a flat pair {output_height, output_width}.
  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
  int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];

  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;
  if (op_params.align_corners && output_height > 1) {
    height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
  }
  if (op_params.align_corners && output_width > 1) {
    width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
  }

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      // Source coordinate and its two surrounding input rows.
      float input_y = y * height_scale;
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        // Source coordinate and its two surrounding input columns.
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        for (int c = 0; c < depth; ++c) {
          // Weighted sum of the four surrounding pixels; the weights are the
          // fractional distances (input_y - y0) and (input_x - x0).
          T interpolation =
              static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
                                 (1 - (input_y - y0)) * (1 - (input_x - x0)) +
                             input_data[Offset(input_shape, b, y1, x0, c)] *
                                 (input_y - y0) * (1 - (input_x - x0)) +
                             input_data[Offset(input_shape, b, y0, x1, c)] *
                                 (1 - (input_y - y0)) * (input_x - x0) +
                             input_data[Offset(input_shape, b, y1, x1, c)] *
                                 (input_y - y0) * (input_x - x0));
          output_data[Offset(output_shape, b, y, x, c)] = interpolation;
        }
      }
    }
  }
}
| |
// SpaceToBatchND: rearranges spatial blocks of the input into the batch
// dimension, after applying zero padding (or the output offset, for
// quantized uint8). Each output batch out_b maps back to input batch
// out_b % input_batch_size, with the block offset encoded in
// out_b / input_batch_size.
template <typename T>
inline void SpaceToBatchND(
    const SpaceToBatchParams& params,
    const RuntimeShape& unextended_input1_shape, const T* input1_data,
    const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
    const RuntimeShape& unextended_input3_shape, const int32* paddings_data,
    const RuntimeShape& unextended_output_shape, T* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input1_shape =
      RuntimeShape::ExtendedShape(4, unextended_input1_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int depth = input1_shape.Dims(3);
  const int input_width = input1_shape.Dims(2);
  const int input_height = input1_shape.Dims(1);
  const int input_batch_size = input1_shape.Dims(0);

  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch_size = output_shape.Dims(0);

  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];
  // paddings_data is laid out as {top, bottom, left, right}.
  const int padding_top = paddings_data[0];
  const int padding_left = paddings_data[2];

  // For uint8 quantized, the correct padding "zero value" is the output offset.
  const int32_t pad_value = params.output_offset;

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    // Decode which input batch and which (shift_h, shift_w) block cell this
    // output batch corresponds to.
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
        if (out_h * block_shape_height + shift_h < padding_top ||
            out_h * block_shape_height + shift_h >=
                padding_top + input_height ||
            out_w * block_shape_width + shift_w < padding_left ||
            out_w * block_shape_width + shift_w >= padding_left + input_width) {
          // This may not execute correctly when pad_value != 0 and T != uint8.
          // (memset writes pad_value byte-wise across the whole slice.)
          memset(out, pad_value, depth * sizeof(T));
        } else {
          // Copy the whole depth slice from the mapped input location.
          const T* in =
              input1_data +
              Offset(input1_shape, input_batch,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     (out_w * block_shape_width + shift_w) - padding_left, 0);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}
| |
// BatchToSpaceND: the inverse of SpaceToBatchND. Moves block data from the
// batch dimension back into spatial positions, skipping any location that
// falls inside the cropped margins.
template <typename T>
inline void BatchToSpaceND(
    const RuntimeShape& unextended_input1_shape, const T* input1_data,
    const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
    const RuntimeShape& unextended_input3_shape, const int32* crops_data,
    const RuntimeShape& unextended_output_shape, T* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input1_shape =
      RuntimeShape::ExtendedShape(4, unextended_input1_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch_size = output_shape.Dims(0);

  const int depth = input1_shape.Dims(3);
  const int input_width = input1_shape.Dims(2);
  const int input_height = input1_shape.Dims(1);
  const int input_batch_size = input1_shape.Dims(0);

  const int block_shape_width = block_shape_data[1];
  const int block_shape_height = block_shape_data[0];
  // crops_data is laid out as {top, bottom, left, right}.
  const int crops_top = crops_data[0];
  const int crops_left = crops_data[2];

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    // Decode target output batch and the block cell encoded in the input
    // batch index.
    const int out_batch = in_batch % output_batch_size;
    const int spatial_offset = in_batch / output_batch_size;
    for (int in_h = 0; in_h < input_height; ++in_h) {
      const int out_h = in_h * block_shape_height +
                        spatial_offset / block_shape_width - crops_top;
      if (out_h < 0 || out_h >= output_height) {
        // Cropped away: nothing to write.
        continue;
      }
      for (int in_w = 0; in_w < input_width; ++in_w) {
        const int out_w = in_w * block_shape_width +
                          spatial_offset % block_shape_width - crops_left;

        if (out_w < 0 || out_w >= output_width) {
          // Cropped away: nothing to write.
          continue;
        }
        // Copy the whole depth slice to the mapped output location.
        T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
        const T* in =
            input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
        memcpy(out, in, depth * sizeof(T));
      }
    }
  }
}
| |
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
// scalar input that provides the padding value. Therefore pad_value_ptr can be
// equivalent to a simple input1_data. For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32 is considered a
// specialization distinct from P=int32.
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
                    const P* pad_value_ptr, const RuntimeShape& output_shape,
                    T* output_data) {
  const RuntimeShape ext_input_shape =
      RuntimeShape::ExtendedShape(4, input_shape);
  const RuntimeShape ext_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);
  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);

  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
  // we can pad them to 4 dims (yes, we are "padding the padding").
  std::vector<int> left_padding_copy(4, 0);
  for (int i = 0; i < op_params.left_padding_count; ++i) {
    left_padding_copy[i] = op_params.left_padding[i];
  }
  std::vector<int> right_padding_copy(4, 0);
  for (int i = 0; i < op_params.right_padding_count; ++i) {
    right_padding_copy[i] = op_params.right_padding[i];
  }

  const int output_batch = ext_output_shape.Dims(0);
  const int output_height = ext_output_shape.Dims(1);
  const int output_width = ext_output_shape.Dims(2);
  const int output_depth = ext_output_shape.Dims(3);

  const int left_b_padding = left_padding_copy[0];
  const int left_h_padding = left_padding_copy[1];
  const int left_w_padding = left_padding_copy[2];
  const int left_d_padding = left_padding_copy[3];

  const int right_b_padding = right_padding_copy[0];
  const int right_h_padding = right_padding_copy[1];
  const int right_w_padding = right_padding_copy[2];
  const int right_d_padding = right_padding_copy[3];

  const T pad_value = *pad_value_ptr;

  // Walk the entire output in order; in_ptr only advances when the current
  // output element is inside the un-padded interior, so input and output
  // stay in lockstep.
  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            // In the padded margin: emit the pad value.
            *out_ptr++ = pad_value;
          } else {
            // Interior: copy the next input element.
            *out_ptr++ = *in_ptr++;
          }
        }
      }
    }
  }
}
| |
| template <typename T, typename P> |
| inline void Pad(const tflite::PadParams& op_params, |
| const RuntimeShape& input_shape, const T* input_data, |
| const P* pad_value_ptr, const RuntimeShape& output_shape, |
| T* output_data) { |
| PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, |
| output_data); |
| } |
| |
| // The second (pad-value) input can be int32 when, say, the first is uint8. |
| template <typename T> |
| inline void Pad(const tflite::PadParams& op_params, |
| const RuntimeShape& input_shape, const T* input_data, |
| const int32* pad_value_ptr, const RuntimeShape& output_shape, |
| T* output_data) { |
| const T converted_pad_value = static_cast<T>(*pad_value_ptr); |
| PadImpl(op_params, input_shape, input_data, &converted_pad_value, |
| output_shape, output_data); |
| } |
| |
| // This version avoids conflicting template matching. |
| template <> |
| inline void Pad(const tflite::PadParams& op_params, |
| const RuntimeShape& input_shape, const int32* input_data, |
| const int32* pad_value_ptr, const RuntimeShape& output_shape, |
| int32* output_data) { |
| PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, |
| output_data); |
| } |
| |
// StridedSlice over up-to-4D tensors. Start/stop/stride for each axis come
// from op_params (after padding/reversing to the runtime's backwards 4D
// convention); elements are copied in nested-loop order into output_data.
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  // Note that the output_shape is not used herein.
  tflite::StridedSliceParams params_copy = op_params;

  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  // Reverse and pad to 4 dimensions because that is what the runtime code
  // requires (ie. all shapes must be 4D and are given backwards).
  strided_slice::StridedSlicePadIndices(&params_copy, 4);

  // Resolve the concrete [start, stop) bounds for each of the 4 axes,
  // honoring begin/end masks and negative indices.
  const int start_b = strided_slice::StartForAxis(params_copy, input_shape, 0);
  const int stop_b =
      strided_slice::StopForAxis(params_copy, input_shape, 0, start_b);
  const int start_h = strided_slice::StartForAxis(params_copy, input_shape, 1);
  const int stop_h =
      strided_slice::StopForAxis(params_copy, input_shape, 1, start_h);
  const int start_w = strided_slice::StartForAxis(params_copy, input_shape, 2);
  const int stop_w =
      strided_slice::StopForAxis(params_copy, input_shape, 2, start_w);
  const int start_d = strided_slice::StartForAxis(params_copy, input_shape, 3);
  const int stop_d =
      strided_slice::StopForAxis(params_copy, input_shape, 3, start_d);

  // LoopCondition handles both positive and negative strides.
  T* out_ptr = output_data;
  for (int in_b = start_b;
       !strided_slice::LoopCondition(in_b, stop_b, params_copy.strides[0]);
       in_b += params_copy.strides[0]) {
    for (int in_h = start_h;
         !strided_slice::LoopCondition(in_h, stop_h, params_copy.strides[1]);
         in_h += params_copy.strides[1]) {
      for (int in_w = start_w;
           !strided_slice::LoopCondition(in_w, stop_w, params_copy.strides[2]);
           in_w += params_copy.strides[2]) {
        for (int in_d = start_d; !strided_slice::LoopCondition(
                 in_d, stop_d, params_copy.strides[3]);
             in_d += params_copy.strides[3]) {
          *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
        }
      }
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline uint32 LegacyReverseBits32(uint32 n) { |
| n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1); |
| n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2); |
| n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4); |
| return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) | |
| ((n & 0xFF000000) >> 24)); |
| } |
| |
// Reverses a StridedSliceParams in place, converting between the forward
// (user-facing) axis order and the backwards order used by the runtime:
// the index/stride arrays are reversed, and each bit mask is bit-reversed
// then shifted down so bit i still refers to the same (now reversed) axis.
inline void StridedSliceReverseIndices(tflite::StridedSliceParams* p) {
  TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
  TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);

  std::reverse(p->start_indices, p->start_indices + p->start_indices_count);
  std::reverse(p->stop_indices, p->stop_indices + p->stop_indices_count);
  std::reverse(p->strides, p->strides + p->strides_count);

  // NOTE(review): if start_indices_count were 0, the shift below would be
  // by 32 bits, which is undefined behavior — callers appear to pass 4
  // (see the legacy StridedSlice DCHECK); confirm before relying on other
  // counts.
  p->begin_mask = LegacyReverseBits32(static_cast<uint32>(p->begin_mask)) >>
                  (32 - p->start_indices_count);
  p->ellipsis_mask =
      LegacyReverseBits32(static_cast<uint32>(p->ellipsis_mask)) >>
      (32 - p->start_indices_count);
  p->end_mask = LegacyReverseBits32(static_cast<uint32>(p->end_mask)) >>
                (32 - p->start_indices_count);
  p->new_axis_mask =
      LegacyReverseBits32(static_cast<uint32>(p->new_axis_mask)) >>
      (32 - p->start_indices_count);
  p->shrink_axis_mask =
      LegacyReverseBits32(static_cast<uint32>(p->shrink_axis_mask)) >>
      (32 - p->start_indices_count);
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T> |
| inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, |
| int begin_mask, int end_mask, int shrink_axis_mask, |
| const std::vector<int>& start_indices, |
| const std::vector<int>& stop_indices, |
| const std::vector<int>& strides, T* output_data, |
| const Dims<4>& output_dims) { |
| TFLITE_DCHECK_EQ(start_indices.size(), 4); |
| auto op_params = strided_slice::BuildStridedSliceParams( |
| begin_mask, end_mask, shrink_axis_mask, start_indices, stop_indices, |
| strides); |
| StridedSliceReverseIndices(&op_params); |
| |
| StridedSlice(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
// Extracts a contiguous slice from a (up to 4D) tensor. op_params.begin and
// op_params.size describe the slice per axis; a size of -1 means "to the
// end of that axis". Shorter begin/size vectors are implicitly front-padded
// to 4 entries (leading axes are taken whole).
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape, const T* input_data,
                  const RuntimeShape& output_shape, T* output_data) {
  const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
  TFLITE_DCHECK_LE(op_params.begin_count, 4);
  TFLITE_DCHECK_LE(op_params.size_count, 4);
  const int begin_count = op_params.begin_count;
  const int size_count = op_params.size_count;
  // We front-pad the begin and size vectors. For each axis, the slice is
  // [start, stop): start is 0 when the axis has no begin entry; stop is the
  // axis end when there is no size entry or the size is -1.
  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
                         ? ext_shape.Dims(0) - start_b
                         : start_b + op_params.size[0];
  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
                         ? ext_shape.Dims(1) - start_h
                         : start_h + op_params.size[size_count - 3];
  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
                         ? ext_shape.Dims(2) - start_w
                         : start_w + op_params.size[size_count - 2];
  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
                         ? ext_shape.Dims(3) - start_d
                         : start_d + op_params.size[size_count - 1];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        for (int in_d = start_d; in_d < stop_d; ++in_d) {
          *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)];
        }
      }
    }
  }
}
| |
// Computes the elementwise natural exponential of `input_data` into
// `output_data`.  Both buffers must hold at least `num_elements` values.
template <typename T>
inline void Exp(const T* input_data, const size_t num_elements,
                T* output_data) {
  for (size_t idx = 0; idx < num_elements; ++idx) {
    // std::exp selects the overload matching T (e.g. expf for float) instead
    // of always routing through the C ::exp(double) overload.
    output_data[idx] = std::exp(input_data[idx]);
  }
}
| |
| // A generic reduce method that can be used for reduce_sum, reduce_mean, etc. |
| // This method iterates through input data and reduce elements along the |
| // dimensions given in axis. |
| template <typename In, typename Out> |
| inline bool Reduce(const In* input_data, const int* input_dims, |
| const int* output_dims, const int input_num_dims, |
| const int output_num_dims, const int* axis, |
| const int num_axis, int* input_iter, |
| Out reducer(const Out current, const In in), |
| Out* output_data) { |
| // Reset input iterator. |
| for (int idx = 0; idx < input_num_dims; ++idx) { |
| input_iter[idx] = 0; |
| } |
| // Iterate through input_data. |
| do { |
| size_t input_offset = |
| ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); |
| size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, |
| input_iter, num_axis, axis); |
| output_data[output_offset] = |
| reducer(output_data[output_offset], input_data[input_offset]); |
| } while (NextIndex(input_num_dims, input_dims, input_iter)); |
| return true; |
| } |
| |
| inline bool ResolveAxis(const int num_dims, const int* axis, |
| const int64_t num_axis, int* out_axis, |
| int* out_num_axis) { |
| *out_num_axis = 0; // Just in case. |
| // Short-circuit axis resolution for scalars; the axis will go unused. |
| if (num_dims == 0) { |
| return true; |
| } |
| // o(n^2) is fine since out_num_axis should be really small, mostly <= 4 |
| for (int64_t idx = 0; idx < num_axis; ++idx) { |
| // Handle negative index. |
| int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx]; |
| TFLITE_DCHECK(current >= 0 && current < num_dims); |
| bool is_dup = false; |
| for (int j = 0; j < *out_num_axis; ++j) { |
| if (out_axis[j] == current) { |
| is_dup = true; |
| break; |
| } |
| } |
| if (!is_dup) { |
| out_axis[*out_num_axis] = current; |
| *out_num_axis += 1; |
| } |
| } |
| return true; |
| } |
| |
| // This method expects that output_data has been initialized. |
| template <typename In, typename Out> |
| inline bool ReduceSumImpl(const In* input_data, const int* input_dims, |
| const int* output_dims, const int input_num_dims, |
| const int output_num_dims, const int* axis, |
| const int num_axis, int* input_iter, |
| Out* output_data) { |
| auto reducer = [](const Out current, const In in) -> Out { |
| const Out actual_in = static_cast<Out>(in); |
| return current + actual_in; |
| }; |
| return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims, |
| output_num_dims, axis, num_axis, input_iter, reducer, |
| output_data); |
| } |
| |
// Fills a tensor with dimensions `dims` (length `num_dims`) with
// `init_value`.  Returns false if the total element count would overflow
// size_t.
template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.  Guard against current == 0: dividing by zero is
    // undefined behavior, and a zero-sized dimension simply makes the tensor
    // empty (num_elements becomes 0 below).
    if (current != 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}
| |
| // Computes the generic value (i.e., sum/max/min/prod) of elements across |
| // dimensions given in axis. It needs to pass in init_value and reducer. |
| template <typename T> |
| inline bool ReduceGeneric(const T* input_data, const int* input_dims, |
| const int input_num_dims, T* output_data, |
| const int* output_dims, const int output_num_dims, |
| const int* axis, const int64_t num_axis_dimensions, |
| bool keep_dims, int* temp_index, int* resolved_axis, |
| T init_value, |
| T reducer(const T current, const T in)) { |
| // Reset output data. |
| if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value, |
| output_data)) { |
| return false; |
| } |
| |
| // Resolve axis. |
| int num_resolved_axis = 0; |
| if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, |
| &num_resolved_axis)) { |
| return false; |
| } |
| |
| return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims, |
| output_num_dims, resolved_axis, num_resolved_axis, |
| temp_index, reducer, output_data); |
| } |
| |
| // Computes the mean of elements across dimensions given in axis. |
| // It does so in two stages, first calculates the sum of elements along the axis |
| // then divides it by the number of element in axis. |
| template <typename T, typename U> |
| inline bool Mean(const T* input_data, const int* input_dims, |
| const int input_num_dims, T* output_data, |
| const int* output_dims, const int output_num_dims, |
| const int* axis, const int num_axis_dimensions, bool keep_dims, |
| int* temp_index, int* resolved_axis, U* temp_sum) { |
| // Reset output data. |
| size_t num_outputs = 1; |
| for (int idx = 0; idx < output_num_dims; ++idx) { |
| size_t current = static_cast<size_t>(output_dims[idx]); |
| // Overflow prevention. |
| if (num_outputs > std::numeric_limits<size_t>::max() / current) { |
| return false; |
| } |
| num_outputs *= current; |
| } |
| for (size_t idx = 0; idx < num_outputs; ++idx) { |
| output_data[idx] = T(); |
| temp_sum[idx] = U(); |
| } |
| |
| // Resolve axis. |
| int num_resolved_axis = 0; |
| if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, |
| &num_resolved_axis)) { |
| return false; |
| } |
| |
| if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, |
| output_num_dims, resolved_axis, num_resolved_axis, |
| temp_index, temp_sum)) { |
| return false; |
| } |
| |
| // Calculate mean by dividing output_data by num of aggregated element. |
| U num_elements_in_axis = 1; |
| for (int idx = 0; idx < num_resolved_axis; ++idx) { |
| size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); |
| // Overflow prevention. |
| if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) { |
| return false; |
| } |
| num_elements_in_axis *= current; |
| } |
| |
| if (num_elements_in_axis > 0) { |
| for (size_t idx = 0; idx < num_outputs; ++idx) { |
| output_data[idx] = |
| static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis)); |
| } |
| } |
| return true; |
| } |
| |
| template <typename T> |
| inline void Mean(const tflite::MeanParams& op_params, |
| const RuntimeShape& unextended_input_shape, |
| const T* input_data, |
| const RuntimeShape& unextended_output_shape, T* output_data) { |
| gemmlowp::ScopedProfilingLabel label("Mean"); |
| |
| TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape input_shape = |
| RuntimeShape::ExtendedShape(4, unextended_input_shape); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| const int output_batch = output_shape.Dims(0); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| const int output_depth = output_shape.Dims(3); |
| |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| |
| // The current implementation only supports simultaneous reduction over |
| // width and height. |
| TFLITE_DCHECK_EQ(op_params.axis_count, 2); |
| TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || |
| (op_params.axis[0] == 2 && op_params.axis[1] == 1)); |
| TFLITE_DCHECK_EQ(output_height, 1); |
| TFLITE_DCHECK_EQ(output_width, 1); |
| |
| for (int out_b = 0; out_b < output_batch; ++out_b) { |
| for (int out_d = 0; out_d < output_depth; ++out_d) { |
| float value = 0; |
| for (int in_h = 0; in_h < input_height; ++in_h) { |
| for (int in_w = 0; in_w < input_width; ++in_w) { |
| value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; |
| } |
| } |
| output_data[Offset(output_shape, out_b, 0, 0, out_d)] = |
| value / (input_width * input_height); |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy Dims<4>. |
| template <typename T> |
| inline void Mean(const T* input_data, const Dims<4>& input_dims, |
| const std::vector<int>& reduction_indices, T* output_data, |
| const Dims<4>& output_dims) { |
| tflite::MeanParams op_params; |
| op_params.axis_count = reduction_indices.size(); |
| for (int i = 0; i < op_params.axis_count; ++i) { |
| op_params.axis[i] = reduction_indices[op_params.axis_count - 1 - i]; |
| } |
| |
| Mean(op_params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
| // Computes the mean of elements across dimensions given in axis. |
| // It does so in two stages, first calculates the sum of elements along the axis |
| // then divides it by the number of element in axis for quantized values. |
| template <typename T, typename U> |
| inline bool Mean(const T* input_data, int32 input_zero_point, float input_scale, |
| const int* input_dims, const int input_num_dims, |
| T* output_data, int32 output_zero_point, float output_scale, |
| const int* output_dims, const int output_num_dims, |
| const int* axis, const int num_axis_dimensions, bool keep_dims, |
| int* temp_index, int* resolved_axis, U* temp_sum) { |
| // Reset output data. |
| size_t num_outputs = 1; |
| for (int idx = 0; idx < output_num_dims; ++idx) { |
| size_t current = static_cast<size_t>(output_dims[idx]); |
| // Overflow prevention. |
| if (num_outputs > std::numeric_limits<size_t>::max() / current) { |
| return false; |
| } |
| num_outputs *= current; |
| } |
| for (size_t idx = 0; idx < num_outputs; ++idx) { |
| output_data[idx] = T(); |
| temp_sum[idx] = U(); |
| } |
| |
| // Resolve axis. |
| int num_resolved_axis = 0; |
| if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, |
| &num_resolved_axis)) { |
| return false; |
| } |
| |
| if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, |
| output_num_dims, resolved_axis, num_resolved_axis, |
| temp_index, temp_sum)) { |
| return false; |
| } |
| |
| // Calculate mean by dividing output_data by num of aggregated element. |
| U num_elements_in_axis = 1; |
| for (int idx = 0; idx < num_resolved_axis; ++idx) { |
| size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); |
| // Overflow prevention. |
| if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) { |
| return false; |
| } |
| num_elements_in_axis *= current; |
| } |
| |
| if (num_elements_in_axis > 0) { |
| const float scale = input_scale / output_scale; |
| const float bias = -input_zero_point * scale; |
| for (size_t idx = 0; idx < num_outputs; ++idx) { |
| float float_mean = static_cast<float>(temp_sum[idx]) / |
| static_cast<float>(num_elements_in_axis); |
| |
| // Convert to float value. |
| output_data[idx] = |
| static_cast<T>(round(float_mean * scale + bias)) + output_zero_point; |
| } |
| } |
| return true; |
| } |
| |
| template <typename T> |
| void Minimum(const RuntimeShape& input1_shape, const T* input1_data, |
| const T* input2_data, const RuntimeShape& output_shape, |
| T* output_data) { |
| const int flat_size = MatchingFlatSize(input1_shape, output_shape); |
| |
| auto min_value = input2_data[0]; |
| for (int i = 0; i < flat_size; i++) { |
| output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i]; |
| } |
| } |
| |
| // Convenience version that allows, for example, generated-code calls to be |
| // the same as other binary ops. |
| template <typename T> |
| inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape&, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| // Drop shape of second input: not needed. |
| Minimum(input1_shape, input1_data, input2_data, output_shape, output_data); |
| } |
| |
| template <typename T> |
| void Maximum(const RuntimeShape& input1_shape, const T* input1_data, |
| const T* input2_data, const RuntimeShape& output_shape, |
| T* output_data) { |
| const int flat_size = MatchingFlatSize(input1_shape, output_shape); |
| |
| auto max_value = input2_data[0]; |
| for (int i = 0; i < flat_size; i++) { |
| output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i]; |
| } |
| } |
| |
| // Convenience version that allows, for example, generated-code calls to be |
| // the same as other binary ops. |
| template <typename T> |
| inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape&, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| // Drop shape of second input: not needed. |
| Maximum(input1_shape, input1_data, input2_data, output_shape, output_data); |
| } |
| |
| template <typename T, typename Op> |
| void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape, |
| const T* input1_data, |
| const RuntimeShape& unextended_input2_shape, |
| const T* input2_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data, Op op) { |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| auto out_idx = Offset(output_shape, b, y, x, c); |
| auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); |
| auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); |
| auto in1_val = input1_data[in1_idx]; |
| auto in2_val = input2_data[in2_idx]; |
| output_data[out_idx] = op(in1_val, in2_val); |
| } |
| } |
| } |
| } |
| } |
| |
| template <typename T1, typename T2, typename T3, typename Cmp> |
| void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data, |
| const T3* input2_data, const RuntimeShape& output_shape, |
| T2* output_data, const Cmp& cmp) { |
| // The current ArgMax implemention can only determine the index of the maximum |
| // value in the last dimension. So the axis argument is ignored. |
| |
| // For ArgMax, the number of output dimensions = (number of input dimensions - |
| // 1). For the sake of simplicity, the output dimensions are equal to the |
| // input dimensions here. We enforce the constraint that the last dimension |
| // must always be 1. |
| const int trailing_dim = output_shape.DimensionsCount() - 1; |
| TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(), |
| output_shape.DimensionsCount()); |
| TFLITE_DCHECK_EQ(output_shape.Dims(trailing_dim), 1); |
| const int outer_size = |
| MatchingFlatSizeSkipDim(input1_shape, trailing_dim, output_shape); |
| const int depth = input1_shape.Dims(trailing_dim); |
| |
| for (int i = 0; i < outer_size; ++i) { |
| auto min_max_value = input1_data[i * depth]; |
| int min_max_index = 0; |
| for (int d = 1; d < depth; ++d) { |
| const auto& curr_value = input1_data[i * depth + d]; |
| if (cmp(curr_value, min_max_value)) { |
| min_max_value = curr_value; |
| min_max_index = d; |
| } |
| } |
| output_data[i] = min_max_index; |
| } |
| } |
| |
| template <typename T1, typename T2, typename T3> |
| void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data, |
| const T3* input2_data, const RuntimeShape& output_shape, |
| T2* output_data) { |
| ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, |
| std::greater<T1>()); |
| } |
| |
| // Convenience version that allows, for example, generated-code calls to be |
| // the same as other binary ops. |
| template <typename T1, typename T2, typename T3> |
| inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data, |
| const RuntimeShape& input2_shape, const T3* input2_data, |
| const RuntimeShape& output_shape, T2* output_data) { |
| // Drop shape of second input: not needed. |
| ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data); |
| } |
| |
// Generic transpose for tensors of rank <= 4: permutes the input dimensions
// according to params.perm and copies elements accordingly.  Inputs/outputs
// of rank < 4 are front-padded to rank 4, and the permutation is extended to
// match.
template <typename T>
void Transpose(const TransposeParams& params,
               const RuntimeShape& unextended_input_shape, const T* input_data,
               const RuntimeShape& unextended_output_shape, T* output_data) {
  const int unextended_output_size = unextended_output_shape.DimensionsCount();
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_size, 4);
  TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  // Number of padding dimensions prepended to each shape to reach rank 4.
  const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
  const int output_ext_size = 4 - unextended_output_size;

  // The perm data is extended to match the output, each index incremented by
  // the amount of front padding of the input shape.
  int extended_perm[4];
  for (int i = 0; i < output_ext_size; ++i) {
    // Padded leading output dims map identically onto the padded leading
    // input dims (both have extent 1).
    extended_perm[i] = i;
  }
  for (int i = 0; i < unextended_output_size; ++i) {
    extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
  }

  int out_sizes[4];
  // Compute the inverse permutation array so we can do an output centered
  // transpose. Also, check to make sure output_dims is matching input_dims.
  for (int k = 0; k < 4; k++) {
    out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k);
  }

  // Naive transpose loop (iterate on output index and compute input index).
  int o[4];  // loop index (on output).
  int i[4];
  // Each output element is a straight copy of the input element at the
  // permuted index; loop nesting order only affects the access pattern.
  for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
    i[extended_perm[3]] = o[3];
    for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
      i[extended_perm[2]] = o[2];
      for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
        i[extended_perm[1]] = o[1];
        for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
          i[extended_perm[0]] = o[0];
          output_data[Offset(output_shape, o)] =
              input_data[Offset(input_shape, i)];
        }
      }
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T> |
| void Transpose(const T* input, const Dims<4>& input_dims, T* output, |
| const Dims<4>& output_dims, const int* permuted_axes) { |
| TransposeParams params; |
| params.perm_count = 4; |
| for (int i = 0; i < 4; ++i) { |
| params.perm[i] = 3 - permuted_axes[3 - i]; |
| } |
| Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims), |
| output); |
| } |
| |
| inline void TransposeConv( |
| const ConvParams& params, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& filter_shape, |
| const float* filter_data, const RuntimeShape& output_shape, |
| float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) { |
| const int stride_width = params.stride_width; |
| const int stride_height = params.stride_height; |
| const int pad_width = params.padding_values.width; |
| const int pad_height = params.padding_values.height; |
| TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); |
| (void)im2col_data; // only used in optimized code. |
| (void)im2col_shape; // only used in optimized code. |
| |
| const int batches = MatchingDim(input_shape, 0, output_shape, 0); |
| const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); |
| const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); |
| const int input_height = input_shape.Dims(1); |
| const int input_width = input_shape.Dims(2); |
| const int filter_height = filter_shape.Dims(1); |
| const int filter_width = filter_shape.Dims(2); |
| const int output_height = output_shape.Dims(1); |
| const int output_width = output_shape.Dims(2); |
| |
| // Although transpose convolution simplifies to convolution with transposed |
| // weights for strides of 1, non-unitary striding complicates matters. To |
| // keep this reference implementation as clear as possible, we use a |
| // "scatter" access pattern, where we loop through all the input elements, |
| // computing their influence on the output, rather than looping through the |
| // output elements in the typical "gather" access pattern of a conv. We |
| // therefore must initialize the output array to zero. |
| const int num_elements = output_shape.FlatSize(); |
| for (int i = 0; i < num_elements; i++) { |
| output_data[i] = 0.0f; |
| } |
| |
| // Loop through input elements one at a time. |
| for (int batch = 0; batch < batches; ++batch) { |
| for (int in_y = 0; in_y < input_height; ++in_y) { |
| for (int in_x = 0; in_x < input_width; ++in_x) { |
| for (int in_channel = 0; in_channel < input_depth; ++in_channel) { |
| // Loop through the output elements it will influence |
| const int out_x_origin = (in_x * stride_width) - pad_width; |
| const int out_y_origin = (in_y * stride_height) - pad_height; |
| for (int filter_y = 0; filter_y < filter_height; ++filter_y) { |
| for (int filter_x = 0; filter_x < filter_width; ++filter_x) { |
| for (int out_channel = 0; out_channel < output_depth; |
| ++out_channel) { |
| // Compute output element location |
| const int out_x = out_x_origin + filter_x; |
| const int out_y = out_y_origin + filter_y; |
| // We cannot accumulate out of bounds |
| if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && |
| (out_y < output_height)) { |
| float input_value = input_data[Offset( |
| input_shape, batch, in_y, in_x, in_channel)]; |
| float filter_value = |
| filter_data[Offset(filter_shape, out_channel, filter_y, |
| filter_x, in_channel)]; |
| output_data[Offset(output_shape, batch, out_y, out_x, |
| out_channel)] += |
| input_value * filter_value; |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, |
| const float* filter_data, const Dims<4>& filter_dims, |
| int stride_width, int stride_height, int pad_width, |
| int pad_height, float* output_data, |
| const Dims<4>& output_dims, float* im2col_data, |
| const Dims<4>& im2col_dims) { |
| tflite::ConvParams op_params; |
| // Padding type is ignored, but still set. |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = pad_width; |
| op_params.padding_values.height = pad_height; |
| op_params.stride_width = stride_width; |
| op_params.stride_height = stride_height; |
| |
| TransposeConv(op_params, DimsToShape(input_dims), input_data, |
| DimsToShape(filter_dims), filter_data, DimsToShape(output_dims), |
| output_data, DimsToShape(im2col_dims), im2col_data); |
| } |
| |
// Scalar comparison predicates used to instantiate the comparison kernels
// below.  Each is a direct application of the corresponding operator so that
// floating-point semantics (including NaN behavior) match the builtin
// comparisons exactly.
template <typename T>
inline bool EqualFn(T lhs, T rhs) {
  return lhs == rhs;
}

template <typename T>
inline bool NotEqualFn(T lhs, T rhs) {
  return lhs != rhs;
}

template <typename T>
inline bool GreaterFn(T lhs, T rhs) {
  return lhs > rhs;
}

template <typename T>
inline bool GreaterEqualFn(T lhs, T rhs) {
  return lhs >= rhs;
}

template <typename T>
inline bool LessFn(T lhs, T rhs) {
  return lhs < rhs;
}

template <typename T>
inline bool LessEqualFn(T lhs, T rhs) {
  return lhs <= rhs;
}

// Function-pointer type the comparison kernels take as a template parameter.
template <typename T>
using ComparisonFn = bool (*)(T, T);
| |
| template <typename T, ComparisonFn<T> F> |
| inline void ComparisonImpl( |
| const ComparisonParams& op_params, const RuntimeShape& input1_shape, |
| const T* input1_data, const RuntimeShape& input2_shape, |
| const T* input2_data, const RuntimeShape& output_shape, bool* output_data) { |
| const int64_t flatsize = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int64_t i = 0; i < flatsize; ++i) { |
| output_data[i] = F(input1_data[i], input2_data[i]); |
| } |
| } |
| |
| template <ComparisonFn<float> F> |
| inline void Comparison(const ComparisonParams& op_params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, bool* output_data) { |
| ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape, |
| input2_data, output_shape, output_data); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T, ComparisonFn<T> F> |
| inline void Comparison(const T* input1_data, const Dims<4>& input1_dims, |
| const T* input2_data, const Dims<4>& input2_dims, |
| bool* output_data, const Dims<4>& output_dims) { |
| ComparisonParams op_params; |
| // No parameters needed. |
| ComparisonImpl<T, F>(op_params, DimsToShape(input1_dims), input1_data, |
| DimsToShape(input2_dims), input2_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename T, ComparisonFn<int32> F> |
| inline void ComparisonWithScaling( |
| const ComparisonParams& op_params, const RuntimeShape& input1_shape, |
| const T* input1_data, const RuntimeShape& input2_shape, |
| const T* input2_data, const RuntimeShape& output_shape, bool* output_data) { |
| int left_shift = op_params.left_shift; |
| int32 input1_offset = op_params.input1_offset; |
| int32 input1_multiplier = op_params.input1_multiplier; |
| int input1_shift = op_params.input1_shift; |
| int32 input2_offset = op_params.input2_offset; |
| int32 input2_multiplier = op_params.input2_multiplier; |
| int input2_shift = op_params.input2_shift; |
| |
| const int64_t flatsize = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int64_t i = 0; i < flatsize; ++i) { |
| const int32 input1_val = input1_offset + input1_data[i]; |
| const int32 input2_val = input2_offset + input2_data[i]; |
| const int32 shifted_input1_val = input1_val * (1 << left_shift); |
| const int32 shifted_input2_val = input2_val * (1 << left_shift); |
| const int32 scaled_input1_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input1_val, input1_multiplier, input1_shift); |
| const int32 scaled_input2_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input2_val, input2_multiplier, input2_shift); |
| output_data[i] = F(scaled_input1_val, scaled_input2_val); |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T, ComparisonFn<int32> F> |
| inline void Comparison(int left_shift, const T* input1_data, |
| const Dims<4>& input1_dims, int32 input1_offset, |
| int32 input1_multiplier, int input1_shift, |
| const T* input2_data, const Dims<4>& input2_dims, |
| int32 input2_offset, int32 input2_multiplier, |
| int input2_shift, bool* output_data, |
| const Dims<4>& output_dims) { |
| tflite::ComparisonParams op_params; |
| op_params.left_shift = left_shift; |
| op_params.input1_offset = input1_offset; |
| op_params.input1_multiplier = input1_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.input1_shift = kReverseShift * input1_shift; |
| op_params.input2_offset = input2_offset; |
| op_params.input2_multiplier = input2_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.input2_shift = kReverseShift * input2_shift; |
| |
| ComparisonWithScaling<T, F>(op_params, DimsToShape(input1_dims), input1_data, |
| DimsToShape(input2_dims), input2_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename T, ComparisonFn<T> F> |
| inline void BroadcastComparison4DSlowImpl( |
| const ComparisonParams& op_params, |
| const RuntimeShape& unextended_input1_shape, const T* input1_data, |
| const RuntimeShape& unextended_input2_shape, const T* input2_data, |
| const RuntimeShape& unextended_output_shape, bool* output_data) { |
| gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlow"); |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| output_data[Offset(output_shape, b, y, x, c)] = |
| F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)]); |
| } |
| } |
| } |
| } |
| } |
| template <ComparisonFn<float> F> |
| inline void BroadcastComparison4DSlow(const ComparisonParams& op_params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| bool* output_data) { |
| BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data, |
| input2_shape, input2_data, |
| output_shape, output_data); |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T, ComparisonFn<T> F> |
| inline void BroadcastComparison(const T* input1_data, |
| const Dims<4>& input1_dims, |
| const T* input2_data, |
| const Dims<4>& input2_dims, bool* output_data, |
| const Dims<4>& output_dims) { |
| ComparisonParams op_params; |
| // No parameters needed. |
| BroadcastComparison4DSlowImpl<T, F>(op_params, DimsToShape(input1_dims), |
| input1_data, DimsToShape(input2_dims), |
| input2_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
// Broadcasting comparison of (typically quantized) inputs.  Each element of
// each input has its offset added, is shifted left by a common amount, and is
// rescaled by its own quantized multiplier/shift before the int32 predicate
// `F` is applied.  Inputs are broadcast against the (at most 4-D) output.
template <typename T, ComparisonFn<int32> F>
inline void BroadcastComparison4DSlowWithScaling(
    const ComparisonParams& op_params,
    const RuntimeShape& unextended_input1_shape, const T* input1_data,
    const RuntimeShape& unextended_input2_shape, const T* input2_data,
    const RuntimeShape& unextended_output_shape, bool* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlowWithScaling");
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  // Pad the output shape to rank 4 so the nested loops below can always
  // iterate over exactly four dimensions.
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  // Broadcasting descriptors that map an output coordinate back to the
  // corresponding (possibly repeated) element of each input.
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);

  // Hoist the quantization parameters out of the inner loop.
  int left_shift = op_params.left_shift;
  int32 input1_offset = op_params.input1_offset;
  int32 input1_multiplier = op_params.input1_multiplier;
  int input1_shift = op_params.input1_shift;
  int32 input2_offset = op_params.input2_offset;
  int32 input2_multiplier = op_params.input2_multiplier;
  int input2_shift = op_params.input2_shift;

  for (int b = 0; b < output_shape.Dims(0); ++b) {
    for (int y = 0; y < output_shape.Dims(1); ++y) {
      for (int x = 0; x < output_shape.Dims(2); ++x) {
        for (int c = 0; c < output_shape.Dims(3); ++c) {
          // Apply each input's offset...
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          // ...amplify by the shared left shift...
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          // ...then rescale each side by its own fixed-point multiplier so
          // both values land in a common int32 scale before comparing.
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, input2_multiplier, input2_shift);
          output_data[Offset(output_shape, b, y, x, c)] =
              F(scaled_input1_val, scaled_input2_val);
        }
      }
    }
  }
}
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T, ComparisonFn<int32> F> |
| inline void BroadcastComparison(int left_shift, const T* input1_data, |
| const Dims<4>& input1_dims, int32 input1_offset, |
| int32 input1_multiplier, int input1_shift, |
| const T* input2_data, |
| const Dims<4>& input2_dims, int32 input2_offset, |
| int32 input2_multiplier, int input2_shift, |
| bool* output_data, const Dims<4>& output_dims) { |
| ComparisonParams op_params; |
| |
| op_params.left_shift = left_shift; |
| op_params.input1_offset = input1_offset; |
| op_params.input1_multiplier = input1_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.input1_shift = kReverseShift * input1_shift; |
| op_params.input2_offset = input2_offset; |
| op_params.input2_multiplier = input2_multiplier; |
| // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. |
| op_params.input2_shift = kReverseShift * input2_shift; |
| |
| BroadcastComparison4DSlowWithScaling<T, F>( |
| op_params, DimsToShape(input1_dims), input1_data, |
| DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), |
| output_data); |
| } |
| |
// Instantiates the full comparison-kernel family for one predicate `name`
// (Equal, NotEqual, ...).  For each `name` this emits:
//   * legacy Dims<4>-based wrappers: `name` (float and quantized "/8bit"
//     overloads) and `Broadcast##name` (same two overloads), forwarding to
//     Comparison / BroadcastComparison;
//   * RuntimeShape-based wrappers: `name` (float), `name##WithScaling`
//     (quantized), and the broadcasting `Broadcast4DSlow##name` /
//     `Broadcast4DSlow##name##WithScaling` variants.
// Every generated function only adds a gemmlowp profiling label and forwards
// to the shared implementation, instantiated with the `name##Fn` element
// predicate.
#define TFLITE_COMPARISON_OP(name)                                             \
  template <typename T>                                                        \
  inline void name(const T* input1_data, const Dims<4>& input1_dims,           \
                   const T* input2_data, const Dims<4>& input2_dims,           \
                   bool* output_data, const Dims<4>& output_dims) {            \
    gemmlowp::ScopedProfilingLabel label(#name);                               \
    Comparison<T, name##Fn>(input1_data, input1_dims, input2_data,             \
                            input2_dims, output_data, output_dims);            \
  }                                                                            \
  template <typename T>                                                        \
  inline void name(                                                            \
      int left_shift, const T* input1_data, const Dims<4>& input1_dims,        \
      int32 input1_offset, int32 input1_multiplier, int input1_shift,          \
      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,   \
      int32 input2_multiplier, int input2_shift, bool* output_data,            \
      const Dims<4>& output_dims) {                                            \
    gemmlowp::ScopedProfilingLabel label(#name "/8bit");                       \
    Comparison<T, name##Fn>(left_shift, input1_data, input1_dims,              \
                            input1_offset, input1_multiplier, input1_shift,    \
                            input2_data, input2_dims, input2_offset,           \
                            input2_multiplier, input2_shift, output_data,      \
                            output_dims);                                      \
  }                                                                            \
  template <typename T>                                                        \
  inline void Broadcast##name(                                                 \
      const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,  \
      const Dims<4>& input2_dims, bool* output_data,                           \
      const Dims<4>& output_dims) {                                            \
    gemmlowp::ScopedProfilingLabel label("Broadcast" #name);                   \
    BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data,    \
                                     input2_dims, output_data, output_dims);   \
  }                                                                            \
  template <typename T>                                                        \
  inline void Broadcast##name(                                                 \
      int left_shift, const T* input1_data, const Dims<4>& input1_dims,        \
      int32 input1_offset, int32 input1_multiplier, int input1_shift,          \
      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,   \
      int32 input2_multiplier, int input2_shift, bool* output_data,            \
      const Dims<4>& output_dims) {                                            \
    gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit");           \
    BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims,     \
                                     input1_offset, input1_multiplier,         \
                                     input1_shift, input2_data, input2_dims,   \
                                     input2_offset, input2_multiplier,         \
                                     input2_shift, output_data, output_dims);  \
  }                                                                            \
  inline void name(const ComparisonParams& op_params,                          \
                   const RuntimeShape& input1_shape, const float* input1_data, \
                   const RuntimeShape& input2_shape, const float* input2_data, \
                   const RuntimeShape& output_shape, bool* output_data) {      \
    gemmlowp::ScopedProfilingLabel label(#name);                               \
    Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape,   \
                         input2_data, output_shape, output_data);              \
  }                                                                            \
  template <typename T>                                                        \
  inline void name##WithScaling(                                               \
      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
      const T* input1_data, const RuntimeShape& input2_shape,                  \
      const T* input2_data, const RuntimeShape& output_shape,                  \
      bool* output_data) {                                                     \
    gemmlowp::ScopedProfilingLabel label(#name "/8bit");                       \
    ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data,   \
                                       input2_shape, input2_data,              \
                                       output_shape, output_data);             \
  }                                                                            \
  inline void Broadcast4DSlow##name(                                           \
      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
      const float* input1_data, const RuntimeShape& input2_shape,              \
      const float* input2_data, const RuntimeShape& output_shape,              \
      bool* output_data) {                                                     \
    gemmlowp::ScopedProfilingLabel label("Broadcast" #name);                   \
    BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data,  \
                                        input2_shape, input2_data,             \
                                        output_shape, output_data);            \
  }                                                                            \
  template <typename T>                                                        \
  inline void Broadcast4DSlow##name##WithScaling(                              \
      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
      const T* input1_data, const RuntimeShape& input2_shape,                  \
      const T* input2_data, const RuntimeShape& output_shape,                  \
      bool* output_data) {                                                     \
    gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit");           \
    BroadcastComparison4DSlowWithScaling<T, name##Fn>(                         \
        op_params, input1_shape, input1_data, input2_shape, input2_data,       \
        output_shape, output_data);                                            \
  }
TFLITE_COMPARISON_OP(Equal);
TFLITE_COMPARISON_OP(NotEqual);
TFLITE_COMPARISON_OP(Greater);
TFLITE_COMPARISON_OP(GreaterEqual);
TFLITE_COMPARISON_OP(Less);
TFLITE_COMPARISON_OP(LessEqual);
#undef TFLITE_COMPARISON_OP
| |
| template <typename D, typename T> |
| void Select(const RuntimeShape& input_condition_shape, |
| const D* input_condition_data, const RuntimeShape& input_x_shape, |
| const T* input_x_data, const RuntimeShape& input_y_shape, |
| const T* input_y_data, const RuntimeShape& output_shape, |
| T* output_data) { |
| const int64_t flatsize = MatchingFlatSize( |
| input_condition_shape, input_x_shape, input_y_shape, output_shape); |
| for (int64_t i = 0; i < flatsize; ++i) { |
| output_data[i] = |
| input_condition_data[i] ? input_x_data[i] : input_y_data[i]; |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename D, typename T> |
| inline void Select(const D* input_condition_data, |
| const Dims<4>& input_condition_dims, const T* input_x_data, |
| const Dims<4>& input_x_dims, const T* input_y_data, |
| const Dims<4>& input_y_dims, T* output_data, |
| const Dims<4>& output_dims) { |
| Select(DimsToShape(input_condition_dims), input_condition_data, |
| DimsToShape(input_x_dims), input_x_data, DimsToShape(input_y_dims), |
| input_y_data, DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename D, typename T> |
| void RankOneSelect(const RuntimeShape& input_condition_shape, |
| const D* input_condition_data, |
| const RuntimeShape& input_x_shape, const T* input_x_data, |
| const RuntimeShape& input_y_shape, const T* input_y_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| const int64_t outer_size = input_condition_shape.FlatSize(); |
| TFLITE_DCHECK_EQ( |
| MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), |
| outer_size); |
| const int64_t inner_size = |
| MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape); |
| |
| int64_t offset = 0; |
| for (int64_t i = 0; i < outer_size; i++) { |
| const T* input_data = input_condition_data[i] ? input_x_data : input_y_data; |
| memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T)); |
| offset += inner_size; |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename D, typename T> |
| inline void RankOneSelect(const D* input_condition_data, |
| const Dims<4>& input_condition_dims, |
| const T* input_x_data, const Dims<4>& input_x_dims, |
| const T* input_y_data, const Dims<4>& input_y_dims, |
| T* output_data, const Dims<4>& output_dims) { |
| RankOneSelect(DimsToShape(input_condition_dims), input_condition_data, |
| DimsToShape(input_x_dims), input_x_data, |
| DimsToShape(input_y_dims), input_y_data, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| // For easy implementation, the indices is always a vector of size-4 vectors. |
| template <typename T, typename TI> |
| inline void SparseToDense(const std::vector<std::vector<TI>>& indices, |
| const T* values, T default_value, |
| bool value_is_scalar, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| const int value_count = indices.size(); |
| |
| // First fill the output_data with default value. |
| const int num_elements = output_shape.FlatSize(); |
| for (int i = 0; i < num_elements; ++i) { |
| output_data[i] = default_value; |
| } |
| |
| // Special handle for value is scalar case to avoid checking the boolean |
| // condition within the loop every time. |
| if (value_is_scalar) { |
| for (int i = 0; i < value_count; ++i) { |
| const std::vector<TI>& index = indices[i]; |
| TFLITE_DCHECK_EQ(index.size(), 4); |
| const T value = *values; // just use the first value. |
| output_data[Offset(output_shape, index[0], index[1], index[2], |
| index[3])] = value; |
| } |
| return; |
| } |
| |
| // Go through the values and indices to fill the sparse values. |
| for (int i = 0; i < value_count; ++i) { |
| const std::vector<TI>& index = indices[i]; |
| TFLITE_DCHECK_EQ(index.size(), 4); |
| const T value = values[i]; |
| output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = |
| value; |
| } |
| } |
| |
| // TODO(b/80418076): Move to legacy ops file, update invocations. |
| // Legacy. |
| template <typename T, typename TI> |
| inline void SparseToDense(const std::vector<std::vector<TI>>& indices, |
| const T* values, T default_value, T* output_data, |
| const Dims<4>& output_dims, bool value_is_scalar) { |
| SparseToDense(indices, values, default_value, value_is_scalar, |
| DimsToShape(output_dims), output_data); |
| } |
| |
| template <typename T> |
| inline void Pow(const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape& input2_shape, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = std::pow(input1_data[i], input2_data[i]); |
| } |
| } |
| |
| template <typename T> |
| inline void BroadcastPow4DSlow(const RuntimeShape& unextended_input1_shape, |
| const T* input1_data, |
| const RuntimeShape& unextended_input2_shape, |
| const T* input2_data, |
| const RuntimeShape& unextended_output_shape, |
| T* output_data) { |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| auto out_idx = Offset(output_shape, b, y, x, c); |
| auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); |
| auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); |
| auto in1_val = input1_data[in1_idx]; |
| auto in2_val = input2_data[in2_idx]; |
| output_data[out_idx] = std::pow(in1_val, in2_val); |
| } |
| } |
| } |
| } |
| } |
| |
| inline void Logical(const RuntimeShape& input1_shape, const bool* input1_data, |
| const RuntimeShape& input2_shape, const bool* input2_data, |
| const RuntimeShape& output_shape, bool* output_data, |
| const std::function<bool(bool, bool)>& func) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = func(input1_data[i], input2_data[i]); |
| } |
| } |
| |
| inline void BroadcastLogical4DSlow( |
| const RuntimeShape& unextended_input1_shape, const bool* input1_data, |
| const RuntimeShape& unextended_input2_shape, const bool* input2_data, |
| const RuntimeShape& unextended_output_shape, bool* output_data, |
| const std::function<bool(bool, bool)>& func) { |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| auto out_idx = Offset(output_shape, b, y, x, c); |
| auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); |
| auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); |
| auto in1_val = input1_data[in1_idx]; |
| auto in2_val = input2_data[in2_idx]; |
| output_data[out_idx] = func(in1_val, in2_val); |
| } |
| } |
| } |
| } |
| } |
| |
| // TODO(ycling): Refactoring. Remove BroadcastLogical and use the more |
| // generalized and efficient BroadcastBinaryFunction. |
| // |
// Also appears to duplicate MinimumMaximum.
| // |
| // R: Result type. T1: Input 1 type. T2: Input 2 type. |
| template <typename R, typename T1, typename T2> |
| inline void BroadcastBinaryFunction4DSlow( |
| const RuntimeShape& unextended_input1_shape, const T1* input1_data, |
| const RuntimeShape& unextended_input2_shape, const T2* input2_data, |
| const RuntimeShape& unextended_output_shape, R* output_data, |
| R (*func)(T1, T2)) { |
| TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); |
| TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); |
| const RuntimeShape output_shape = |
| RuntimeShape::ExtendedShape(4, unextended_output_shape); |
| |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, |
| unextended_input2_shape, &desc1, &desc2); |
| |
| for (int b = 0; b < output_shape.Dims(0); ++b) { |
| for (int y = 0; y < output_shape.Dims(1); ++y) { |
| for (int x = 0; x < output_shape.Dims(2); ++x) { |
| for (int c = 0; c < output_shape.Dims(3); ++c) { |
| auto out_idx = Offset(output_shape, b, y, x, c); |
| auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); |
| auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); |
| auto in1_val = input1_data[in1_idx]; |
| auto in2_val = input2_data[in2_idx]; |
| output_data[out_idx] = func(in1_val, in2_val); |
| } |
| } |
| } |
| } |
| } |
| |
| // R: Result type. T1: Input 1 type. T2: Input 2 type. |
| // TODO(renjieliu): Refactor other binary functions to use this one. |
| template <typename R, typename T1, typename T2> |
| inline void BinaryFunction(const RuntimeShape& input1_shape, |
| const T1* input1_data, |
| const RuntimeShape& input2_shape, |
| const T2* input2_data, |
| const RuntimeShape& output_shape, R* output_data, |
| R (*func)(T1, T2)) { |
| const int flat_size = |
| MatchingFlatSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = func(input1_data[i], input2_data[i]); |
| } |
| } |
| |
| } // namespace reference_ops |
| } // namespace tflite |
| |
| #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ |