/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string.h>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"

#ifdef USE_NEON

// Each 128-bit NEON register holds four 32-bit floats, so the vectorized
// loops below process four values per iteration.
#define kFloatWeightsPerNeonLane 4

namespace tflite {
namespace tensor_utils {

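// Computes result += matrix * vector for each of the n_batch input vectors.
// The matrix is m_rows x m_cols, stored row-major; results for consecutive
// matrix rows are written result_stride floats apart.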
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride) {
  // If m_cols is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot cover the tail columns, which must be processed
  // sequentially. postamble_start is the index where that scalar path begins.
  const int postamble_start =
      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
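  // For example, m_cols == 10 gives postamble_start == 8: columns 0..7 are
  // handled four at a time by NEON, and columns 8..9 by the scalar loops.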

  // Cache for the input vector, one NEON register (four floats) per entry.
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[m_cols / kFloatWeightsPerNeonLane];
  const int kUnrollSize = 2;
  for (int b = 0; b < n_batch; b++) {
    float* result_in_batch = result + b * m_rows * result_stride;
    const float* vector_in_batch = vector + b * m_cols;

    const float* matrix_ptr0 = matrix;
    // If there is only 1 row, we don't want to assign an illegal pointer.
    const float* matrix_ptr1 = nullptr;
    if (m_rows > 1) {
      matrix_ptr1 = matrix + m_cols;
    }
    // Cache the vector.
    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
    }

    // Main loop: computes two matrix-row dot products per iteration.
    for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from the two matrix rows.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
        // Multiply-accumulate 4 float values.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
        acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
      }
      // Add the 4 intermediate sum values to get the final dot-product value
      // for each of the two rows.
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      *(result_in_batch + result_stride) +=
          (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
           vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
      // Postamble loop for the tail columns.
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
        *(result_in_batch + result_stride) +=
            matrix_ptr1[c] * vector_in_batch[c];
      }
      matrix_ptr0 += kUnrollSize * m_cols;
      matrix_ptr1 += kUnrollSize * m_cols;
      result_in_batch += kUnrollSize * result_stride;
    }
    // Leftover loop: handles the final row when m_rows is odd.
    for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from the matrix row.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        // Multiply-accumulate 4 float values.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
      }
      // Add the 4 intermediate sum values to get the final dot-product value
      // for this row.
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      // Postamble loop for the tail columns.
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
      }
      matrix_ptr0 += m_cols;
      result_in_batch += result_stride;
    }
  }
  delete[] vector_cache_float32x4;
}

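// Illustrative usage of NeonMatrixBatchVectorMultiplyAccumulate (the values
// are hypothetical):
//   float matrix[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // 2x4, row-major.
//   float vec[4] = {1, 1, 1, 1};                 // A single batch.
//   float result[2] = {0, 0};
//   NeonMatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2,
//                                           /*m_cols=*/4, vec, /*n_batch=*/1,
//                                           result, /*result_stride=*/1);
//   // result is now {10, 26}.

// Computes the element-wise product: result[i] = vector1[i] * vector2[i].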
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Multiply 4 float values.
    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(&result[v], mul_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = vector1[v] * vector2[v];
  }
}

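// Accumulates the element-wise product: result[i] += vector1[i] * vector2[i].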
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1, vector2, and the accumulator.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    float32x4_t acc_32x4 = vld1q_f32(result + v);
    // Multiply-accumulate 4 float values.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(&result[v], acc_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] += vector1[v] * vector2[v];
  }
}

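// For each of the n_batch vectors stored contiguously in batch_vector,
// accumulates the element-wise product with vector:
//   result[b * v_size + i] += vector[i] * batch_vector[b * v_size + i].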
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // Cache for the input vector, one NEON register (four floats) per entry.
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[v_size / kFloatWeightsPerNeonLane];
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
  }

  float* result_ptr = result;
  const float* batch_vector_ptr = batch_vector;
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
      // Load from memory to vectors.
      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
      // Multiply-accumulate.
      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
                               vector_cache_float32x4[v >> 2]);
      // Store.
      vst1q_f32(result_ptr + v, result_f32x4);
    }
    // Postamble loop.
    for (int v = postamble_start; v < v_size; v++) {
      result_ptr[v] += vector[v] * batch_vector_ptr[v];
    }
    // Update the pointers.
    result_ptr += v_size;
    batch_vector_ptr += v_size;
  }
  delete[] vector_cache_float32x4;
}

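// Computes result[i] = 1.0f - vector[i], for i in [0, v_size).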
void NeonSub1Vector(const float* vector, int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from the input vector and subtract them from 1.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = 1.0f - vector[v];
  }
}

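// Clips each element of vector to the range [-abs_limit, abs_limit].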
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // Replicate abs_limit and -abs_limit in two vectors.
  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);

  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load from memory to vector.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    // Clip between abs_limit and -abs_limit.
    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
  }
}

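// Returns the dot product of vector1 and vector2.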
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the tail elements
  // must be processed sequentially. postamble_start is the index where the
  // scalar path begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  float32x4_t acc_32x4 = vmovq_n_f32(0.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Multiply-accumulate 4 float values.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
  }

  // Add the 4 intermediate sum values to get the vectorized partial result.
  float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
                  vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result += vector1[v] * vector2[v];
  }
  return result;
}

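// Computes the dot product of each pair of corresponding batch vectors,
// writing the b-th result to result[b * result_stride].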
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride) {
  float* result_ptr = result;
  const float* vector1_ptr = vector1;
  const float* vector2_ptr = vector2;
  for (int b = 0; b < n_batch; b++) {
    *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
    vector1_ptr += v_size;
    vector2_ptr += v_size;
    result_ptr += result_stride;
  }
}

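// Reduces input_vector (of size output_size * reduction_size) by summing each
// run of reduction_size consecutive elements into the corresponding entry:
//   output_vector[o] += sum of input_vector[o * reduction_size ..
//                                           (o + 1) * reduction_size - 1].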
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size) {
  const float* input_vector_ptr = input_vector;
  // If reduction_size is not divisible by kFloatWeightsPerNeonLane, the tail
  // elements must be processed sequentially. postamble_start is the index
  // where the scalar path begins.
  const int postamble_start =
      reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
  for (int o = 0; o < output_size; o++) {
    float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
    for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
      float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
      sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
    }
    output_vector[o] +=
        (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
         vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
    input_vector_ptr += postamble_start;

    // Postamble loop.
    for (int r = postamble_start; r < reduction_size; r++) {
      output_vector[o] += *input_vector_ptr++;
    }
  }
}

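// Shifts the elements of vector left by one position and writes shift_value
// into the freed last slot, e.g. {1, 2, 3} with shift_value 9 becomes
// {2, 3, 9}.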
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
  // last_index_copy is the highest index the next vectorized load will read;
  // the loop runs only while that index is within the vector boundary.
  int last_index_copy = kFloatWeightsPerNeonLane;
  int current_index_copy = 0;
  while (last_index_copy < v_size) {
    float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
    vst1q_f32(vector + current_index_copy, v_f32x4);
    current_index_copy += kFloatWeightsPerNeonLane;
    last_index_copy += kFloatWeightsPerNeonLane;
  }
  // Postamble loop.
  for (int i = current_index_copy; i < v_size - 1; i++) {
    vector[i] = vector[i + 1];
  }
  vector[v_size - 1] = shift_value;
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // USE_NEON