/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of quantized DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};
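
// For orientation: a minimal scalar sketch of the accumulation that every
// specialization below implements. This is illustrative only (the name is
// ours, not part of the original API), and it assumes the strided calling
// convention where input_ptr advances by input_ptr_increment per output pixel
// and acc_buffer is laid out channel-major, i.e. accumulator index
// ic * depth_multiplier + m within each output pixel.
inline void QuantizedDepthwiseConvKernelScalarSketch(
    int num_output_pixels, int input_depth, int depth_multiplier,
    const uint8* input_ptr, int16 input_offset, int input_ptr_increment,
    const uint8* filter_ptr, int16 filter_offset, int32* acc_buffer_ptr) {
  for (int outp = 0; outp < num_output_pixels; outp++) {
    for (int ic = 0; ic < input_depth; ic++) {
      // Offset input and filter values into signed 16-bit, then accumulate
      // their 32-bit product, exactly as the NEON kernels do with vmlal_s16.
      const int16 input_val = input_ptr[ic] + input_offset;
      for (int m = 0; m < depth_multiplier; m++) {
        const int16 filter_val =
            filter_ptr[ic * depth_multiplier + m] + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
      }
    }
    input_ptr += input_ptr_increment;
  }
}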

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
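      // For example, if input = [i0 i1 i2 i3 i4 i5 i6 i7], then
      // input_dup2.val[0] = [i0 i0 i1 i1 i2 i2 i3 i3] and
      // input_dup2.val[1] = [i4 i4 i5 i5 i6 i6 i7 i7], so each input channel
      // lines up with its two per-channel filter values (depth_multiplier 2).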
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
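      // The four live lanes of input are [p0c0 p0c1 p1c0 p1c1]: both channels
      // of the two output pixels handled in this iteration. The
      // vmlal_lane_s16 calls below pair channel-0 lanes with filter[0] and
      // channel-1 lanes with filter[1].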
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
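    // The two filter values are duplicated into lanes [f0 f1 f0 f1], so a
    // single vmlal_s16 below can cover two depth-2 output pixels at once.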

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                   {2, 3, 3, 3, 4, 4, 4, 5},
                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
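    // For example, vtbl1_u8 with dup3_indices[0] maps input bytes
    // [b0 b1 b2 b3 b4 b5 b6 b7] to [b0 b0 b0 b1 b1 b1 b2 b2]; the three index
    // vectors together produce the full 24-byte 3-fold duplication.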
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 =
              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 =
            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 =
            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter =
            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        const int16 filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
    // We load the first 16 bytes into filter_u8_{0,1} as usual.
    // Then we load the 8 last bytes into filter_u8_x (x for 'extra').
    // This is redundant: the first 4 bytes of filter_u8_x are the same
    // as the last 4 bytes of filter_u8_1.
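    // Concretely: filter_u8_0 covers filter bytes 0..7, filter_u8_1 covers
    // bytes 8..15, and filter_u8_x covers bytes 12..19, so only the high half
    // of filter_x (bytes 16..19) carries new data; it feeds acc_4 below.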
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint16x4_t input_u16 = vdup_n_u16(0);
      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
                                input_u16, 0);
      input_ptr += input_ptr_increment;
      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
                                input_u16, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vreinterpret_s16_u16(
          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    if (num_output_pixels <= 0) {
      return;
    }

    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle one output pixel at a time until second to the last pixel. Second
    // to the last because vld1_u8 reads eight input values (two pixels' worth
    // of depth-4 input) while only four are needed.
| for (; outp < num_output_pixels - 1; outp++) { |
| // Load the accumulators from acc_buffer |
| int32x4_t acc; |
| acc = vld1q_s32(acc_buffer_ptr); |
| |
| // Load the inputs, add input_offset. |
| uint8x8_t input_u8 = vld1_u8(input_ptr); |
| input_ptr += input_ptr_increment; |
| const int16x4_t input_s16 = |
| vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); |
| const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); |
| // Multiply-accumulate |
| acc = vmlal_s16(acc, filter, input); |
| // Store the accumulators back to acc_buffer |
| vst1q_s32(acc_buffer_ptr, acc); |
| acc_buffer_ptr += 4; |
| } |
| |
| // Handle the last output pixel. |
| // Load the accumulators from acc_buffer |
| int32x4_t acc; |
| acc = vld1q_s32(acc_buffer_ptr); |
| |
| // Load the inputs, add input_offset. |
| uint8x8_t input_u8 = vdup_n_u8(0); |
| input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); |
| input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); |
| input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); |
| input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); |
| const int16x4_t input_s16 = |
| vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); |
| const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); |
| // Multiply-accumulate |
| acc = vmlal_s16(acc, filter, input); |
| // Store the accumulators back to acc_buffer |
| vst1q_s32(acc_buffer_ptr, acc); |
| } |
| }; |
| |
| template <> |
| struct QuantizedDepthwiseConvKernel<false, 12, 1> { |
| static void Run(int num_output_pixels, int input_depth, int depth_multiplier, |
| const uint8* input_ptr, int16 input_offset, |
| int input_ptr_increment, const uint8* filter_ptr, |
| int16 filter_offset, int32* acc_buffer_ptr) { |
| // Load the filters, add filter_offset. |
| uint8x8_t filter_u8_0 = vld1_u8(filter_ptr); |
| uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4); |
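| // Note the overlapping 8-byte loads: filter_u8_0 covers channels 0-7 and |
| // filter_u8_1 covers channels 4-11, so together they span all 12 |
| // channels; only the high half of filter_s16_1 (channels 8-11) is |
| // consumed below. The input loads in the loop use the same trick. |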
| int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); |
| int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); |
| filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset)); |
| filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset)); |
| int16x4_t filter_0 = vget_low_s16(filter_s16_0); |
| int16x4_t filter_1 = vget_high_s16(filter_s16_0); |
| int16x4_t filter_2 = vget_high_s16(filter_s16_1); |
| |
| // Handle one output pixel at a time. |
| for (int outp = 0; outp < num_output_pixels; outp++) { |
| // Load the inputs, add input_offset. |
| uint8x8_t input_u8_0 = vld1_u8(input_ptr); |
| uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4); |
| input_ptr += input_ptr_increment; |
| int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); |
| int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); |
| input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); |
| input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); |
| |
| // Load the accumulators from acc_buffer |
| int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); |
| int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); |
| int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); |
| |
| // Multiply-accumulate |
| acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); |
| acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); |
| acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); |
| |
| // Store the accumulators back to acc_buffer |
| vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); |
| vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); |
| vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); |
| |
| acc_buffer_ptr += 12; |
| } |
| } |
| }; |
| #endif // USE_NEON |
| |
| // Accumulates the effect of one filter row over a segment of one output |
| // row, reading the corresponding row of the input. |
| template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> |
| void QuantizedDepthwiseConvAccumRow( |
| int stride, int input_depth, int input_width, const uint8* input_data, |
| int16 input_offset, int pad_width, int depth_multiplier, int filter_width, |
| const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, |
| int out_x_buffer_end, int output_depth, int32* acc_buffer) { |
| #ifdef GEMMLOWP_PROFILING |
| gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); |
| #endif |
| // Sanity check parameters. This is important in particular to ensure |
| // that we keep the number of template instantiations minimal, so we don't |
| // increase binary size unnecessarily. |
| static_assert(kFixedDepthMultiplier || !kFixedInputDepth, |
| "fixing the input depth requires fixing the depth multiplier too"); |
| static_assert(kFixedInputDepth || kAllowStrided, |
| "non-strided kernels must fix the input depth"); |
| TFLITE_DCHECK(stride == 1 || kAllowStrided); |
| if (kFixedInputDepth) { |
| TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth); |
| } |
| if (kFixedDepthMultiplier) { |
| TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); |
| } |
| TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); |
| const int input_ptr_increment = stride * input_depth; |
| const uint8* filter_base_ptr = filter_data; |
| for (int filter_x = 0; filter_x < filter_width; ++filter_x) { |
| // For the current (filter_x, filter_y) point in the filter, |
| // compute the boundaries of the corresponding output row segment. |
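| // An output position out_x reads the input at |
| // in_x = out_x * stride - pad_width + filter_x; requiring |
| // 0 <= in_x < input_width and solving for out_x yields the |
| // ceil-divisions below (ceil(a/b) computed as (a + b - 1) / b). The |
| // stride==2 and stride==4 cases are peeled off so the division is by a |
| // compile-time constant. |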
| int out_x_loop_start_unclamped = 0; |
| int out_x_loop_end_unclamped = 0; |
| if (kAllowStrided) { |
| if (stride == 2) { |
| out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2; |
| out_x_loop_end_unclamped = |
| (pad_width + input_width - filter_x + 1) / 2; |
| } else if (stride == 4) { |
| out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4; |
| out_x_loop_end_unclamped = |
| (pad_width + input_width - filter_x + 3) / 4; |
| } else { |
| out_x_loop_start_unclamped = |
| (pad_width - filter_x + stride - 1) / stride; |
| out_x_loop_end_unclamped = |
| (pad_width + input_width - filter_x + stride - 1) / stride; |
| } |
| } else { |
| out_x_loop_start_unclamped = pad_width - filter_x; |
| out_x_loop_end_unclamped = pad_width + input_width - filter_x; |
| } |
| // The kernel will have to iterate on the segment of the output row that |
| // starts at out_x_loop_start and ends at out_x_loop_end. |
| const int out_x_loop_start = |
| std::max(out_x_buffer_start, out_x_loop_start_unclamped); |
| const int out_x_loop_end = |
| std::min(out_x_buffer_end, out_x_loop_end_unclamped); |
| |
| int32* acc_buffer_ptr = |
| acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; |
| const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; |
| const uint8* input_ptr = input_data + in_x_origin * input_depth; |
| const int num_output_pixels = out_x_loop_end - out_x_loop_start; |
| QuantizedDepthwiseConvKernel< |
| kAllowStrided, kFixedInputDepth, |
| kFixedDepthMultiplier>::Run(num_output_pixels, input_depth, |
| depth_multiplier, input_ptr, input_offset, |
| input_ptr_increment, filter_base_ptr, |
| filter_offset, acc_buffer_ptr); |
| filter_base_ptr += output_depth; |
| } |
| } |
| |
| // Generic fallback of QuantizedDepthwiseConvAccumRow; portable, |
| // non-templatized. |
| inline void QuantizedDepthwiseConvAccumRowGeneric( |
| int stride, int input_depth, int input_width, const uint8* input_data, |
| int16 input_offset, int pad_width, int depth_multiplier, int filter_width, |
| const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, |
| int out_x_buffer_end, int output_depth, int32* acc_buffer) { |
| gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); |
| #ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| LOG(FATAL) |
| << "\n\n" |
| << "*****************************************************************\n" |
| << "* This tfmini inference code was about to use the slow generic\n" |
| << "* fallback implementation for a DepthwiseConv op, and we want you\n" |
| << "* to be aware of that so that you will know why you get terrible\n" |
| << "* performance.\n" |
| << "*\n" |
| << "* If you would like to carry on with the slow code, compile\n" |
| << "* with this preprocessor token defined:\n" |
| << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n" |
| << "*\n" |
| << "* The right thing to do, if you care about performance, is to add\n" |
| << "* a new DepthwiseConv kernel to tfmini to cover your case.\n" |
| << "* The relevant parameters defining your case are:\n" |
| << "* stride = " << stride << "\n" |
| << "* input_depth = " << input_depth << "\n" |
| << "* depth_multiplier = " << depth_multiplier << "\n" |
| << "*\n" |
| << "* Please do not hesitate to contact benoitjacob@ with this\n" |
| << "* information.\n" |
| << "*****************************************************************\n"; |
| #endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| #endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| const uint8* filter_base_ptr = filter_data; |
| for (int filter_x = 0; filter_x < filter_width; ++filter_x) { |
| const int out_x_loop_start = std::max( |
| out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); |
| const int out_x_loop_end = |
| std::min(out_x_buffer_end, |
| (pad_width + input_width - filter_x + stride - 1) / stride); |
| |
| int32* acc_buffer_ptr = |
| acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; |
| const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; |
| const uint8* input_ptr = input_data + in_x_origin * input_depth; |
| const int input_ptr_increment = (stride - 1) * input_depth; |
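| // Reference accumulation: the inner channel loop already advances |
| // input_ptr by input_depth on its own, so only the remaining |
| // (stride - 1) * input_depth step is added per output pixel. Each input |
| // channel ic feeds depth_multiplier consecutive output channels, |
| // oc = ic * depth_multiplier + m, matching the filter layout. |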
| for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { |
| const uint8* filter_ptr = filter_base_ptr; |
| for (int ic = 0; ic < input_depth; ++ic) { |
| const int16 input_val = *input_ptr++ + input_offset; |
| for (int m = 0; m < depth_multiplier; m++) { |
| const int16 filter_val = *filter_ptr++ + filter_offset; |
| *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; |
| } |
| } |
| input_ptr += input_ptr_increment; |
| } |
| filter_base_ptr += output_depth; |
| } |
| } |
| |
| // Initializes the accumulator buffer with bias values. |
| inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, |
| const int32* bias_data, |
| int32* acc_buffer) { |
| int i = 0; |
| #ifdef USE_NEON |
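| // Fast NEON paths for common small depths: build 128-bit vectors holding |
| // the repeating bias pattern and store them across several pixels per |
| // iteration, leaving any remainder to the scalar loop below. |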
| if (output_depth == 1) { |
| const int32x4_t b = vdupq_n_s32(bias_data[0]); |
| for (; i <= num_output_pixels - 16; i += 16) { |
| vst1q_s32(acc_buffer + i + 0, b); |
| vst1q_s32(acc_buffer + i + 4, b); |
| vst1q_s32(acc_buffer + i + 8, b); |
| vst1q_s32(acc_buffer + i + 12, b); |
| } |
| for (; i <= num_output_pixels - 4; i += 4) { |
| vst1q_s32(acc_buffer + i, b); |
| } |
| } else if (output_depth == 2) { |
| int32x4_t b = vdupq_n_s32(bias_data[0]); |
| b = vsetq_lane_s32(bias_data[1], b, 1); |
| b = vsetq_lane_s32(bias_data[1], b, 3); |
| for (; i <= num_output_pixels - 8; i += 8) { |
| vst1q_s32(acc_buffer + 2 * i + 0, b); |
| vst1q_s32(acc_buffer + 2 * i + 4, b); |
| vst1q_s32(acc_buffer + 2 * i + 8, b); |
| vst1q_s32(acc_buffer + 2 * i + 12, b); |
| } |
| for (; i <= num_output_pixels - 2; i += 2) { |
| vst1q_s32(acc_buffer + 2 * i, b); |
| } |
| } else if (output_depth == 4) { |
| const int32x4_t b = vld1q_s32(bias_data); |
| for (; i <= num_output_pixels - 4; i += 4) { |
| vst1q_s32(acc_buffer + 4 * i + 0, b); |
| vst1q_s32(acc_buffer + 4 * i + 4, b); |
| vst1q_s32(acc_buffer + 4 * i + 8, b); |
| vst1q_s32(acc_buffer + 4 * i + 12, b); |
| } |
| for (; i < num_output_pixels; i++) { |
| vst1q_s32(acc_buffer + 4 * i, b); |
| } |
| } else if (output_depth == 8) { |
| const int32x4_t b0 = vld1q_s32(bias_data); |
| const int32x4_t b1 = vld1q_s32(bias_data + 4); |
| for (; i <= num_output_pixels - 2; i += 2) { |
| vst1q_s32(acc_buffer + 8 * i + 0, b0); |
| vst1q_s32(acc_buffer + 8 * i + 4, b1); |
| vst1q_s32(acc_buffer + 8 * i + 8, b0); |
| vst1q_s32(acc_buffer + 8 * i + 12, b1); |
| } |
| for (; i < num_output_pixels; i++) { |
| vst1q_s32(acc_buffer + 8 * i + 0, b0); |
| vst1q_s32(acc_buffer + 8 * i + 4, b1); |
| } |
| } else if (output_depth == 16) { |
| const int32x4_t b0 = vld1q_s32(bias_data); |
| const int32x4_t b1 = vld1q_s32(bias_data + 4); |
| const int32x4_t b2 = vld1q_s32(bias_data + 8); |
| const int32x4_t b3 = vld1q_s32(bias_data + 12); |
| for (; i < num_output_pixels; i++) { |
| vst1q_s32(acc_buffer + 16 * i + 0, b0); |
| vst1q_s32(acc_buffer + 16 * i + 4, b1); |
| vst1q_s32(acc_buffer + 16 * i + 8, b2); |
| vst1q_s32(acc_buffer + 16 * i + 12, b3); |
| } |
| } |
| #endif // USE_NEON |
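| // Scalar fallback: one memcpy of the bias row per remaining pixel. This |
| // also covers all output depths without a NEON path above. |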
| for (; i < num_output_pixels; i++) { |
| memcpy(acc_buffer + i * output_depth, bias_data, |
| sizeof(acc_buffer[0]) * output_depth); |
| } |
| } |
| |
| inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int stride_width, int stride_height, int pad_width, |
| int pad_height, int depth_multiplier, |
| int32 output_offset, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, |
| int32 output_activation_max, uint8* output_data, |
| const Dims<4>& output_dims) { |
| gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| |
| const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); |
| const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); |
| const int input_height = ArraySize(input_dims, 2); |
| const int input_width = ArraySize(input_dims, 1); |
| const int input_depth = ArraySize(input_dims, 0); |
| const int filter_height = ArraySize(filter_dims, 2); |
| const int filter_width = ArraySize(filter_dims, 1); |
| const int output_height = ArraySize(output_dims, 2); |
| const int output_width = ArraySize(output_dims, 1); |
| TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); |
| |
| static const int kAccBufferMaxSize = 2048; |
| int32 acc_buffer[kAccBufferMaxSize]; |
| TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth); |
| const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; |
| const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; |
| TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, |
| kAccBufferActualSize); |
| TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); |
| TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1); |
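| // The output row is processed in chunks of kOutputPixelsInAccBuffer |
| // pixels so that a chunk's int32 accumulators fit in the fixed-size |
| // stack buffer above; e.g. with output_depth == 64, each chunk covers |
| // 2048 / 64 = 32 output pixels. |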
| |
| // row_accum_func will point to the core accumulation function to be used |
| // for this DepthwiseConv op. |
| using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); |
| row_accum_func_t row_accum_func = nullptr; |
| |
| #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ |
| FIXED_DEPTH_MULTIPLIER) \ |
| if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ |
| (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ |
| depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ |
| row_accum_func = \ |
| QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ |
| FIXED_DEPTH_MULTIPLIER>; \ |
| } |
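| // Each invocation of this macro expands to an if-statement, so the first |
| // matching kernel in the lists below wins. FIXED_INPUT_DEPTH == 0 means |
| // "any input depth": the depth check degenerates to true and the kernel |
| // reads input_depth at runtime. |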
| |
| #ifdef USE_NEON |
| // We go over our list of kernels by decreasing order of preference |
| // for the cases where multiple kernels could apply. |
| |
| // Start with the fastest kernels: AllowStrided=false, fixed input depth. |
| |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) |
| |
| // Next come the strided kernels: AllowStrided=true, fixed input depth. |
| // They are a bit less efficient, but allow stride!=1. |
| |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) |
| |
| // Finally, the kernels allowing a variable input depth; |
| // these are the least efficient but most general kernels. |
| |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) |
| TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) |
| #endif // USE_NEON |
| |
| // No matching fast kernel found; use the slow generic fallback. |
| if (!row_accum_func) { |
| row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; |
| } |
| |
| #undef TFMINI_USE_DEPTHWISECONV_KERNEL |
| |
| // Now that we have determined row_accum_func, we can start work. |
| uint8* output_ptr = output_data; |
| for (int b = 0; b < batches; ++b) { |
| for (int out_y = 0; out_y < output_height; ++out_y) { |
| const int in_y_origin = (out_y * stride_height) - pad_height; |
| const int filter_y_start = std::max(0, -in_y_origin); |
| const int filter_y_end = |
| std::min(filter_height, input_height - in_y_origin); |
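| // Clamp the filter rows so that in_y = in_y_origin + filter_y stays |
| // inside [0, input_height). Rows falling in the padding are skipped |
| // entirely: since input_offset is the negated input zero point, a |
| // padded (zero-point) input value would contribute exactly zero anyway. |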
| for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; |
| out_x_buffer_start += kOutputPixelsInAccBuffer) { |
| const int out_x_buffer_end = std::min( |
| output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); |
| // We call a 'pixel' a group of activation values that share all but the |
| // 'depth'/'channel' coordinate. num_output_pixels is the number of |
| // output pixels that we will accumulate in this loop iteration. |
| const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; |
| // Initialize our local accumulator with the bias values, so we don't |
| // have to add them later. |
| DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, |
| acc_buffer); |
| // Accumulation loop. Most of the time should be spent in here. |
| for (int filter_y = filter_y_start; filter_y < filter_y_end; |
| ++filter_y) { |
| const int in_y = in_y_origin + filter_y; |
| row_accum_func( |
| stride_width, input_depth, input_width, |
| input_data + in_y * input_dims.strides[2] + |
| b * input_dims.strides[3], |
| input_offset, pad_width, depth_multiplier, filter_width, |
| filter_data + filter_y * filter_dims.strides[2], filter_offset, |
| out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); |
| } |
| // Finished accumulating int32 values. Now we need to convert them to |
| // the final 8-bit form and store them. |
| gemmlowp::ScopedProfilingLabel label("downquantize+store"); |
| const int num_output_values = output_depth * num_output_pixels; |
| int i = 0; |
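| // Scalar reference for the requantization done below: each int32 |
| // accumulator becomes |
| //   clamp(RoundingDivideByPOT( |
| //             SaturatingRoundingDoublingHighMul(acc, output_multiplier), |
| //             output_shift) + output_offset), |
| // an effective scale of output_multiplier / 2^(31 + output_shift). The |
| // NEON paths implement the same formula, with vqrdmulh providing the |
| // saturating rounding doubling high multiply. |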
| #ifdef USE_NEON |
| using gemmlowp::RoundingDivideByPOT; |
| const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); |
| const int32x4_t output_activation_min_vec = |
| vdupq_n_s32(output_activation_min); |
| const int32x4_t output_activation_max_vec = |
| vdupq_n_s32(output_activation_max); |
| // Handle 16 values at once. |
| // This allows us to issue 4 mutually independent int32 |
| // multiplications (vqrdmulh), which should alleviate most of their |
| // high latency. |
| for (; i <= num_output_values - 16; i += 16) { |
| int32x4_t acc[4]; |
| for (int j = 0; j < 4; j++) { |
| acc[j] = vld1q_s32(acc_buffer + i + 4 * j); |
| } |
| |
| // Fixed-point multiplication. |
| for (int j = 0; j < 4; j++) { |
| acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); |
| } |
| for (int j = 0; j < 4; j++) { |
| acc[j] = RoundingDivideByPOT(acc[j], output_shift); |
| } |
| // Add the output offset. |
| for (int j = 0; j < 4; j++) { |
| acc[j] = vaddq_s32(acc[j], output_offset_vec); |
| } |
| // Apply the activation function. |
| for (int j = 0; j < 4; j++) { |
| acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); |
| } |
| for (int j = 0; j < 4; j++) { |
| acc[j] = vminq_s32(acc[j], output_activation_max_vec); |
| } |
| // Saturating cast to uint8 and store to destination. |
| int16x4_t acc_s16[4]; |
| for (int j = 0; j < 4; j++) { |
| acc_s16[j] = vqmovn_s32(acc[j]); |
| } |
| const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); |
| const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); |
| const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); |
| const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); |
| vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); |
| output_ptr += 16; |
| } |
| // Handle 8 values at once. |
| // Not as good as 16 (now we're only issuing 2 mutually independent |
| // vqrdmulh instructions, so we're probably paying for their high |
| // latency). |
| for (; i <= num_output_values - 8; i += 8) { |
| int32x4_t acc0 = vld1q_s32(acc_buffer + i); |
| int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); |
| // Fixed-point multiplication. |
| acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); |
| acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); |
| // Rounding right shift. |
| acc0 = RoundingDivideByPOT(acc0, output_shift); |
| acc1 = RoundingDivideByPOT(acc1, output_shift); |
| // Add the output offset. |
| acc0 = vaddq_s32(acc0, output_offset_vec); |
| acc1 = vaddq_s32(acc1, output_offset_vec); |
| // Apply the activation function. |
| acc0 = vmaxq_s32(acc0, output_activation_min_vec); |
| acc1 = vmaxq_s32(acc1, output_activation_min_vec); |
| acc0 = vminq_s32(acc0, output_activation_max_vec); |
| acc1 = vminq_s32(acc1, output_activation_max_vec); |
| // Saturating cast to uint8 and store to destination. |
| const int16x4_t acc0_s16 = vqmovn_s32(acc0); |
| const int16x4_t acc1_s16 = vqmovn_s32(acc1); |
| const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); |
| const uint8x8_t res_u8 = vqmovun_s16(res_s16); |
| vst1_u8(output_ptr, res_u8); |
| output_ptr += 8; |
| } |
| // Handle 4 values at once. Now we're paying the full price of the |
| // high latency of vqrdmulh. Also, storing only 4 bytes at the end |
| // (without any alignment guarantee) can only be done 1 byte at a time. |
| // Still, that is worth doing to minimize the number of leftover values |
| // that would otherwise go through the very slow scalar code. |
| for (; i <= num_output_values - 4; i += 4) { |
| int32x4_t acc = vld1q_s32(acc_buffer + i); |
| // Fixed-point multiplication. |
| acc = vqrdmulhq_n_s32(acc, output_multiplier); |
| // Rounding right shift. |
| acc = RoundingDivideByPOT(acc, output_shift); |
| // Add the output offset. |
| acc = vaddq_s32(acc, output_offset_vec); |
| // Apply the activation function. |
| acc = vmaxq_s32(acc, output_activation_min_vec); |
| acc = vminq_s32(acc, output_activation_max_vec); |
| // Saturating cast to uint8 and store to destination. |
| const int16x4_t acc_s16 = vqmovn_s32(acc); |
| const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); |
| const uint8x8_t res_u8 = vqmovun_s16(res_s16); |
| vst1_lane_u8(output_ptr + 0, res_u8, 0); |
| vst1_lane_u8(output_ptr + 1, res_u8, 1); |
| vst1_lane_u8(output_ptr + 2, res_u8, 2); |
| vst1_lane_u8(output_ptr + 3, res_u8, 3); |
| output_ptr += 4; |
| } |
| #endif // USE_NEON |
| |
| // Handle leftover values, one by one. This is very slow. |
| for (; i < num_output_values; i++) { |
| int32 acc = acc_buffer[i]; |
| acc = MultiplyByQuantizedMultiplierSmallerThanOne( |
| acc, output_multiplier, output_shift); |
| acc += output_offset; |
| acc = std::max(acc, output_activation_min); |
| acc = std::min(acc, output_activation_max); |
| *output_ptr++ = static_cast<uint8>(acc); |
| } |
| } |
| } |
| } |
| } |
| |
| // Legacy, for compatibility with old checked-in code. |
| template <FusedActivationFunctionType Ac> |
| void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, |
| int stride_width, int stride_height, int pad_width, |
| int pad_height, int depth_multiplier, int32 output_offset, |
| int32 output_multiplier, int output_shift, |
| int32 output_activation_min, int32 output_activation_max, |
| uint8* output_data, const Dims<4>& output_dims) { |
| if (Ac == FusedActivationFunctionType::kNone) { |
| TFLITE_DCHECK_EQ(output_activation_min, 0); |
| TFLITE_DCHECK_EQ(output_activation_max, 255); |
| } |
| DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, |
| filter_offset, bias_data, bias_dims, stride_width, |
| stride_height, pad_width, pad_height, depth_multiplier, |
| output_offset, output_multiplier, output_shift, |
| output_activation_min, output_activation_max, output_data, |
| output_dims); |
| } |
| |
| // Legacy, for compatibility with old checked-in code. |
| template <FusedActivationFunctionType Ac> |
| void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, |
| int32 input_offset, const uint8* filter_data, |
| const Dims<4>& filter_dims, int32 filter_offset, |
| const int32* bias_data, const Dims<4>& bias_dims, int stride, |
| int pad_width, int pad_height, int depth_multiplier, |
| int32 output_offset, int32 output_multiplier, |
| int output_shift, int32 output_activation_min, |
| int32 output_activation_max, uint8* output_data, |
| const Dims<4>& output_dims) { |
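| // Forwards the single stride argument as both stride_width and |
| // stride_height. |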
| DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, |
| filter_dims, filter_offset, bias_data, bias_dims, stride, |
| stride, pad_width, pad_height, depth_multiplier, |
| output_offset, output_multiplier, output_shift, |
| output_activation_min, output_activation_max, output_data, |
| output_dims); |
| } |
| |
| } // namespace optimized_ops |
| } // namespace tflite |
| |
| #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ |