// internal/output_neon.h - platform/external/gemmlowp - Git at Google

 // Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // output_neon.h: optimized NEON specializations of the templates in output.h.

 #ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
 #define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

 #include "output.h"

 #include <arm_neon.h>

 namespace gemmlowp {

 // Fragment aliases wrapping NEON vector types. Each one is a column-major
 // Fragment with the given number of scalar entries in a single column:
 //   - 4 int32 entries held in one int32x4_t,
 //   - 16 int32 entries held in one int32x4x4_t (four int32x4_t vectors),
 //   - 4 uint8 entries held in a uint8x8_t,
 //   - 16 uint8 entries held in a uint8x16_t.
 using NEONFragmentInt32x4x1 = Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>;
 using NEONFragmentInt32x16x1 =
     Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor>;
 using NEONFragmentUint8x4x1 = Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor>;
 using NEONFragmentUint8x16x1 =
     Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor>;

 // The code in unpack_neon.h will whenever possible process
 // 16 entries at once (4 SIMD vectors of 4 entries each at once),
 // to offer the compiler better optimization opportunities, reducing
 // register dependencies. From the perspective of interfacing with the output
 // pipeline, this takes the form of passing Fragment types wrapping int32x4x4_t
 // data. In most cases, such data is handled simply by handling separately its
 // 4 int32x4_t components. This partial specialization handles that for
 // arbitrary output stages implementing a int32x4_t path. Only some output
 // stages below will override this to use custom code to handle int32x4x4_t
 // data all at once (see OutputStageSaturatingCastToUint8 below).
 template <typename OutputStageType>
 struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
   typedef NEONFragmentInt32x16x1 InputType;
   typedef NEONFragmentInt32x16x1 OutputType;
   typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
       ImplInt32x4;
   OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

   OutputType Eval(InputType input, int row, int col) const {
     OutputType output;

     for (int i = 0; i < 4; i++) {
       output.data.val[i] =
           impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
     }
     return output;
   }

   ImplInt32x4 impl_int32x4;
 };

 // NEON (int32x4_t) implementation of OutputStageQuantizeDownInt32ToUint8Scale.
 // Every lane is transformed as
 //   ((input + result_offset) * result_mult_int + rounding) >> result_shift
 // with one offset, multiplier and shift shared by all lanes.
 template <>
 struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                            NEONFragmentInt32x4x1> {
   using InputType = NEONFragmentInt32x4x1;
   using OutputType = NEONFragmentInt32x4x1;
   using OutputStage = OutputStageQuantizeDownInt32ToUint8Scale;

   // Stores a reference to the stage; `s` must outlive this evaluator.
   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   // Applies the scaling to the 4 lanes of `input`. The position arguments
   // are unused because the parameters do not depend on it.
   OutputType Eval(InputType input, int, int) const {
     const std::int32_t shift = output_stage.result_shift;
     const std::int32_t multiplier = output_stage.result_mult_int;
     const std::int32_t offset = output_stage.result_offset;
     // Half of 2^shift, added before shifting so the shift rounds instead of
     // truncating; zero when there is no right shift to perform.
     const std::int32_t rounding = (shift >= 1) ? (1 << (shift - 1)) : 0;
     const int32x4_t offset_input = vaddq_s32(input, vdupq_n_s32(offset));
     // vmlaq_n_s32(acc, v, k) yields acc + v * k in each lane.
     const int32x4_t scaled =
         vmlaq_n_s32(vdupq_n_s32(rounding), offset_input, multiplier);
     // A negative count makes vshlq_s32 shift to the right.
     return vshlq_s32(scaled, vdupq_n_s32(-shift));
   }

   const OutputStage& output_stage;
 };

 // NEON (int32x4_t) implementation of
 // OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>.
 // Same arithmetic as the uniform scale stage, except that the multiplier and
 // offset are vectors: the 4 consecutive entries starting at `row` are loaded
 // and applied lane by lane. The shift is still shared by all lanes.
 template <>
 struct OutputStageEvalImpl<
     OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
     NEONFragmentInt32x4x1> {
   using InputType = NEONFragmentInt32x4x1;
   using OutputType = NEONFragmentInt32x4x1;
   using OutputStage =
       OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>;

   // Stores a reference to the stage; `s` must outlive this evaluator.
   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   // Evaluates the stage on the 4x1 fragment located at (row, col).
   OutputType Eval(InputType input, int row, int col) const {
     const std::int32_t shift = output_stage.result_shift;
     // Half of 2^shift, so that the final right shift rounds; zero when there
     // is no right shift.
     const std::int32_t rounding = (shift >= 1) ? (1 << (shift - 1)) : 0;
     const int32x4_t offsets =
         vld1q_s32(output_stage.result_offset.data(row));
     const int32x4_t multipliers =
         vld1q_s32(output_stage.result_mult_int.data(row));
     const int32x4_t offset_input = vaddq_s32(input, offsets);
     // vmlaq_s32(acc, v, w) yields acc + v * w in each lane.
     const int32x4_t scaled =
         vmlaq_s32(vdupq_n_s32(rounding), offset_input, multipliers);
     // A negative count makes vshlq_s32 shift to the right.
     return vshlq_s32(scaled, vdupq_n_s32(-shift));
   }

   const OutputStage& output_stage;
 };

 // NEON (int32x4_t) implementation of
 // OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>.
 //
 // With Row-shaped parameter vectors the multiplier and offset are looked up
 // by column. A NEONFragmentInt32x4x1 spans 4 rows of the single column `col`,
 // so all 4 lanes share one entry of each vector: it is read once and
 // broadcast, the same way the OutputStageBiasAddition specialization in this
 // file handles Row-shaped vectors.
 template <>
 struct OutputStageEvalImpl<
     OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
     NEONFragmentInt32x4x1> {
   typedef NEONFragmentInt32x4x1 InputType;
   typedef NEONFragmentInt32x4x1 OutputType;
   typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
       OutputStage;

   // Stores a reference to the stage; `s` must outlive this evaluator.
   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   // Evaluates the stage on the 4x1 fragment located at (row, col). Each lane
   // becomes
   //   ((input + result_offset[col]) * result_mult_int[col] + rounding)
   //       >> result_shift
   // `row` is not needed because the parameters only vary with the column.
   OutputType Eval(InputType input, int row, int col) const {
     const std::int32_t result_shift = output_stage.result_shift;
     // Half of 2^result_shift, so that the final right shift rounds; zero
     // when there is no right shift.
     const std::int32_t preshift_offset =
         (result_shift < 1) ? 0 : (1 << (result_shift - 1));
     // vld1q_dup_s32 reads a single int32 and replicates it into all 4 lanes.
     // Both parameters are indexed by `col`: indexing result_offset by `row`,
     // or loading 4 consecutive entries, would apply values that belong to
     // other positions and could read past the end of the vector.
     const int32x4_t result_mult_int =
         vld1q_dup_s32(output_stage.result_mult_int.data(col));
     const int32x4_t result_offset =
         vld1q_dup_s32(output_stage.result_offset.data(col));
     const int32x4_t a = vaddq_s32(input, result_offset);
     // vmlaq_s32(acc, v, w) yields acc + v * w in each lane.
     const int32x4_t b =
         vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
     // A negative count makes vshlq_s32 shift to the right.
     return vshlq_s32(b, vdupq_n_s32(-result_shift));
   }

   const OutputStage& output_stage;
 };

 // NEON implementation of OutputStageSaturatingCastToUint8 for a 4x1 int32
 // fragment. The 4 values are narrowed with saturation, first to int16 and
 // then to uint8, and end up in lanes 0..3 of the returned uint8x8_t.
 template <>
 struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                            NEONFragmentInt32x4x1> {
   using InputType = NEONFragmentInt32x4x1;
   using OutputType = NEONFragmentUint8x4x1;
   using OutputStage = OutputStageSaturatingCastToUint8;

   // The stage carries no parameters that are needed here.
   OutputStageEvalImpl(const OutputStage&) {}

   // Converts the 4 lanes of `input`; the position arguments are unused.
   OutputType Eval(InputType input, int, int) const {
     // int32 -> int16 with signed saturation.
     const int16x4_t narrowed = vqmovn_s32(input);
     // Pad the upper half with zeros to obtain a full 8-lane vector.
     const int16x8_t padded = vcombine_s16(narrowed, vdup_n_s16(0));
     // int16 -> uint8 with unsigned saturation.
     return vqmovun_s16(padded);
   }
 };

 // NEON implementation of OutputStageSaturatingCastToUint8 for a 16x1 int32
 // fragment.
 //
 // Handling all 16 values at once is much more efficient than treating them
 // as 4 separate int32x4 values through the generic partial specialization
 // above. It also avoids the poor (50%) register utilization of
 // FragmentUint8x4x1: 16 scalar values exactly fill a uint8x16_t.
 template <>
 struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                            NEONFragmentInt32x16x1> {
   using InputType = NEONFragmentInt32x16x1;
   using OutputType = NEONFragmentUint8x16x1;
   using OutputStage = OutputStageSaturatingCastToUint8;

   // The stage carries no parameters that are needed here.
   OutputStageEvalImpl(const OutputStage&) {}

   // Converts the 16 lanes of `input`; the position arguments are unused.
   OutputType Eval(InputType input, int, int) const {
     // int32 -> int16 with signed saturation, two int32x4_t per int16x8_t.
     const int16x8_t low_half = vcombine_s16(vqmovn_s32(input.data.val[0]),
                                             vqmovn_s32(input.data.val[1]));
     const int16x8_t high_half = vcombine_s16(vqmovn_s32(input.data.val[2]),
                                              vqmovn_s32(input.data.val[3]));
     // int16 -> uint8 with unsigned saturation, then join both halves.
     return vcombine_u8(vqmovun_s16(low_half), vqmovun_s16(high_half));
   }
 };

 // NEON (int32x4_t) implementation of OutputStageBiasAddition: adds the bias
 // vector of the stage to the 4 lanes of the input fragment.
 template <typename VectorType>
 struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                            NEONFragmentInt32x4x1> {
   using InputType = NEONFragmentInt32x4x1;
   using OutputType = NEONFragmentInt32x4x1;
   using OutputStage = OutputStageBiasAddition<VectorType>;

   // Stores a reference to the stage; `s` must outlive this evaluator.
   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   // Evaluates the stage on the 4x1 fragment located at (row, col).
   OutputType Eval(InputType input, int row, int col) const {
     if (VectorType::kShape == VectorShape::Row) {
       // Row-shaped bias: the single entry at `col` applies to all 4 lanes.
       return vaddq_s32(input, vdupq_n_s32(output_stage.bias_vector(col)));
     }
     // Otherwise: one bias entry per lane, the 4 entries starting at `row`.
     return vaddq_s32(input, vld1q_s32(output_stage.bias_vector.data(row)));
   }

   const OutputStage& output_stage;
 };

 // NEON (int32x4_t) implementation of OutputStageClamp: limits every lane to
 // the range given by the stage's `min` and `max`.
 template <>
 struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
   using InputType = NEONFragmentInt32x4x1;
   using OutputType = NEONFragmentInt32x4x1;
   using OutputStage = OutputStageClamp;

   // Stores a reference to the stage; `s` must outlive this evaluator.
   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   // Clamps the 4 lanes of `input`; the position arguments are unused.
   OutputType Eval(InputType input, int, int) const {
     const int32x4_t lower = vdupq_n_s32(output_stage.min);
     const int32x4_t upper = vdupq_n_s32(output_stage.max);
     // Raise to the lower bound first, then cap at the upper bound.
     const int32x4_t raised = vmaxq_s32(input, lower);
     return vminq_s32(raised, upper);
   }

   const OutputStage& output_stage;
 };

 // NEON (int32x4_t) implementation of OutputStageTanh. Everything is
 // inherited from OutputStageTanhEvalImpl instantiated for
 // NEONFragmentInt32x4x1; this specialization only forwards the stage to it.
 template <>
 struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
     : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
   OutputStageEvalImpl(const OutputStageTanh& output_stage)
       : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1>(output_stage) {}
 };

 // StoreFinalOutput for NEONFragmentUint8x4x1: writes lanes 0..3 of `value`
 // to dst at rows row..row+3 of column `col`, one byte at a time.
 // This is quite inefficient, but there is no choice: instructions storing
 // 32 bits at once also assume 32-bit alignment. In practice the slowness is
 // not a problem because the x16 path is used for most values.
 template <typename DstType>
 inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst, int row,
                              int col) {
   const uint8x8_t lanes = value;
   // The lane index must be a compile-time constant, hence no loop.
   vst1_lane_u8(dst->data(row + 0, col), lanes, 0);
   vst1_lane_u8(dst->data(row + 1, col), lanes, 1);
   vst1_lane_u8(dst->data(row + 2, col), lanes, 2);
   vst1_lane_u8(dst->data(row + 3, col), lanes, 3);
 }

 // StoreFinalOutput for NEONFragmentUint8x16x1: writes the 16 bytes of
 // `value` with a single vector store at the address dst gives for
 // (row, col).
 template <typename DstType>
 inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
                              int row, int col) {
   auto* const destination = dst->data(row, col);
   vst1q_u8(destination, value);
 }

 // StoreFinalOutput for NEONFragmentInt32x4x1 with an int32 destination:
 // writes the 4 lanes of `value` with a single vector store at the address
 // dst gives for (row, col).
 template <typename DstType>
 inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
                              int col) {
   auto* const destination = dst->data(row, col);
   vst1q_s32(destination, value);
 }

 // StoreFinalOutput for NEONFragmentInt32x16x1 with an int32 destination:
 // writes the 4 int32x4_t components of `value` one after the other,
 // component k going to row + 4 * k of column `col`.
 template <typename DstType>
 inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
                              int row, int col) {
   int component_row = row;
   for (int k = 0; k != 4; ++k, component_row += 4) {
     vst1q_s32(dst->data(component_row, col), value.data.val[k]);
   }
 }

 }  // namespace gemmlowp

 #endif  // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
	// Copyright 2015 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// output_neon.h: optimized NEON specializations of the templates in output.h.

	#ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
	#define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

	#include "output.h"

	#include <arm_neon.h>

	namespace gemmlowp {

	// Fragment aliases wrapping NEON vector types. Each one is a column-major
	// Fragment with the given number of scalar entries in a single column:
	//   - 4 int32 entries held in one int32x4_t,
	//   - 16 int32 entries held in one int32x4x4_t (four int32x4_t vectors),
	//   - 4 uint8 entries held in a uint8x8_t,
	//   - 16 uint8 entries held in a uint8x16_t.
	using NEONFragmentInt32x4x1 = Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>;
	using NEONFragmentInt32x16x1 =
	    Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor>;
	using NEONFragmentUint8x4x1 = Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor>;
	using NEONFragmentUint8x16x1 =
	    Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor>;

	// The code in unpack_neon.h will whenever possible process
	// 16 entries at once (4 SIMD vectors of 4 entries each at once),
	// to offer the compiler better optimization opportunities, reducing
	// register dependencies. From the perspective of interfacing with the output
	// pipeline, this takes the form of passing Fragment types wrapping int32x4x4_t
	// data. In most cases, such data is handled simply by handling separately its
	// 4 int32x4_t components. This partial specialization handles that for
	// arbitrary output stages implementing a int32x4_t path. Only some output
	// stages below will override this to use custom code to handle int32x4x4_t
	// data all at once (see OutputStageSaturatingCastToUint8 below).
	template <typename OutputStageType>
	struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
	typedef NEONFragmentInt32x16x1 InputType;
	typedef NEONFragmentInt32x16x1 OutputType;
	typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
	ImplInt32x4;
	OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

	OutputType Eval(InputType input, int row, int col) const {
	OutputType output;

	for (int i = 0; i < 4; i++) {
	output.data.val[i] =
	impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
	}
	return output;
	}

	ImplInt32x4 impl_int32x4;
	};

	// NEON (int32x4_t) implementation of OutputStageQuantizeDownInt32ToUint8Scale.
	// Every lane is transformed as
	//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
	// with one offset, multiplier and shift shared by all lanes.
	template <>
	struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
	                           NEONFragmentInt32x4x1> {
	  using InputType = NEONFragmentInt32x4x1;
	  using OutputType = NEONFragmentInt32x4x1;
	  using OutputStage = OutputStageQuantizeDownInt32ToUint8Scale;

	  // Stores a reference to the stage; `s` must outlive this evaluator.
	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // Applies the scaling to the 4 lanes of `input`. The position arguments
	  // are unused because the parameters do not depend on it.
	  OutputType Eval(InputType input, int, int) const {
	    const std::int32_t shift = output_stage.result_shift;
	    const std::int32_t multiplier = output_stage.result_mult_int;
	    const std::int32_t offset = output_stage.result_offset;
	    // Half of 2^shift, added before shifting so the shift rounds instead of
	    // truncating; zero when there is no right shift to perform.
	    const std::int32_t rounding = (shift >= 1) ? (1 << (shift - 1)) : 0;
	    const int32x4_t offset_input = vaddq_s32(input, vdupq_n_s32(offset));
	    // vmlaq_n_s32(acc, v, k) yields acc + v * k in each lane.
	    const int32x4_t scaled =
	        vmlaq_n_s32(vdupq_n_s32(rounding), offset_input, multiplier);
	    // A negative count makes vshlq_s32 shift to the right.
	    return vshlq_s32(scaled, vdupq_n_s32(-shift));
	  }

	  const OutputStage& output_stage;
	};

	// NEON (int32x4_t) implementation of
	// OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>.
	// Same arithmetic as the uniform scale stage, except that the multiplier and
	// offset are vectors: the 4 consecutive entries starting at `row` are loaded
	// and applied lane by lane. The shift is still shared by all lanes.
	template <>
	struct OutputStageEvalImpl<
	    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
	    NEONFragmentInt32x4x1> {
	  using InputType = NEONFragmentInt32x4x1;
	  using OutputType = NEONFragmentInt32x4x1;
	  using OutputStage =
	      OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>;

	  // Stores a reference to the stage; `s` must outlive this evaluator.
	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // Evaluates the stage on the 4x1 fragment located at (row, col).
	  OutputType Eval(InputType input, int row, int col) const {
	    const std::int32_t shift = output_stage.result_shift;
	    // Half of 2^shift, so that the final right shift rounds; zero when there
	    // is no right shift.
	    const std::int32_t rounding = (shift >= 1) ? (1 << (shift - 1)) : 0;
	    const int32x4_t offsets =
	        vld1q_s32(output_stage.result_offset.data(row));
	    const int32x4_t multipliers =
	        vld1q_s32(output_stage.result_mult_int.data(row));
	    const int32x4_t offset_input = vaddq_s32(input, offsets);
	    // vmlaq_s32(acc, v, w) yields acc + v * w in each lane.
	    const int32x4_t scaled =
	        vmlaq_s32(vdupq_n_s32(rounding), offset_input, multipliers);
	    // A negative count makes vshlq_s32 shift to the right.
	    return vshlq_s32(scaled, vdupq_n_s32(-shift));
	  }

	  const OutputStage& output_stage;
	};

	// NEON (int32x4_t) implementation of
	// OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>.
	//
	// With Row-shaped parameter vectors the multiplier and offset are looked up
	// by column. A NEONFragmentInt32x4x1 spans 4 rows of the single column `col`,
	// so all 4 lanes share one entry of each vector: it is read once and
	// broadcast, the same way the OutputStageBiasAddition specialization in this
	// file handles Row-shaped vectors.
	template <>
	struct OutputStageEvalImpl<
	    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
	    NEONFragmentInt32x4x1> {
	  typedef NEONFragmentInt32x4x1 InputType;
	  typedef NEONFragmentInt32x4x1 OutputType;
	  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
	      OutputStage;

	  // Stores a reference to the stage; `s` must outlive this evaluator.
	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // Evaluates the stage on the 4x1 fragment located at (row, col). Each lane
	  // becomes
	  //   ((input + result_offset[col]) * result_mult_int[col] + rounding)
	  //       >> result_shift
	  // `row` is not needed because the parameters only vary with the column.
	  OutputType Eval(InputType input, int row, int col) const {
	    const std::int32_t result_shift = output_stage.result_shift;
	    // Half of 2^result_shift, so that the final right shift rounds; zero
	    // when there is no right shift.
	    const std::int32_t preshift_offset =
	        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
	    // vld1q_dup_s32 reads a single int32 and replicates it into all 4 lanes.
	    // Both parameters are indexed by `col`: indexing result_offset by `row`,
	    // or loading 4 consecutive entries, would apply values that belong to
	    // other positions and could read past the end of the vector.
	    const int32x4_t result_mult_int =
	        vld1q_dup_s32(output_stage.result_mult_int.data(col));
	    const int32x4_t result_offset =
	        vld1q_dup_s32(output_stage.result_offset.data(col));
	    const int32x4_t a = vaddq_s32(input, result_offset);
	    // vmlaq_s32(acc, v, w) yields acc + v * w in each lane.
	    const int32x4_t b =
	        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
	    // A negative count makes vshlq_s32 shift to the right.
	    return vshlq_s32(b, vdupq_n_s32(-result_shift));
	  }

	  const OutputStage& output_stage;
	};

	// NEON implementation of OutputStageSaturatingCastToUint8 for a 4x1 int32
	// fragment. The 4 values are narrowed with saturation, first to int16 and
	// then to uint8, and end up in lanes 0..3 of the returned uint8x8_t.
	template <>
	struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
	                           NEONFragmentInt32x4x1> {
	  using InputType = NEONFragmentInt32x4x1;
	  using OutputType = NEONFragmentUint8x4x1;
	  using OutputStage = OutputStageSaturatingCastToUint8;

	  // The stage carries no parameters that are needed here.
	  OutputStageEvalImpl(const OutputStage&) {}

	  // Converts the 4 lanes of `input`; the position arguments are unused.
	  OutputType Eval(InputType input, int, int) const {
	    // int32 -> int16 with signed saturation.
	    const int16x4_t narrowed = vqmovn_s32(input);
	    // Pad the upper half with zeros to obtain a full 8-lane vector.
	    const int16x8_t padded = vcombine_s16(narrowed, vdup_n_s16(0));
	    // int16 -> uint8 with unsigned saturation.
	    return vqmovun_s16(padded);
	  }
	};

	// NEON implementation of OutputStageSaturatingCastToUint8 for a 16x1 int32
	// fragment.
	//
	// Handling all 16 values at once is much more efficient than treating them
	// as 4 separate int32x4 values through the generic partial specialization
	// above. It also avoids the poor (50%) register utilization of
	// FragmentUint8x4x1: 16 scalar values exactly fill a uint8x16_t.
	template <>
	struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
	                           NEONFragmentInt32x16x1> {
	  using InputType = NEONFragmentInt32x16x1;
	  using OutputType = NEONFragmentUint8x16x1;
	  using OutputStage = OutputStageSaturatingCastToUint8;

	  // The stage carries no parameters that are needed here.
	  OutputStageEvalImpl(const OutputStage&) {}

	  // Converts the 16 lanes of `input`; the position arguments are unused.
	  OutputType Eval(InputType input, int, int) const {
	    // int32 -> int16 with signed saturation, two int32x4_t per int16x8_t.
	    const int16x8_t low_half = vcombine_s16(vqmovn_s32(input.data.val[0]),
	                                            vqmovn_s32(input.data.val[1]));
	    const int16x8_t high_half = vcombine_s16(vqmovn_s32(input.data.val[2]),
	                                             vqmovn_s32(input.data.val[3]));
	    // int16 -> uint8 with unsigned saturation, then join both halves.
	    return vcombine_u8(vqmovun_s16(low_half), vqmovun_s16(high_half));
	  }
	};

	// NEON (int32x4_t) implementation of OutputStageBiasAddition: adds the bias
	// vector of the stage to the 4 lanes of the input fragment.
	template <typename VectorType>
	struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
	                           NEONFragmentInt32x4x1> {
	  using InputType = NEONFragmentInt32x4x1;
	  using OutputType = NEONFragmentInt32x4x1;
	  using OutputStage = OutputStageBiasAddition<VectorType>;

	  // Stores a reference to the stage; `s` must outlive this evaluator.
	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // Evaluates the stage on the 4x1 fragment located at (row, col).
	  OutputType Eval(InputType input, int row, int col) const {
	    if (VectorType::kShape == VectorShape::Row) {
	      // Row-shaped bias: the single entry at `col` applies to all 4 lanes.
	      return vaddq_s32(input, vdupq_n_s32(output_stage.bias_vector(col)));
	    }
	    // Otherwise: one bias entry per lane, the 4 entries starting at `row`.
	    return vaddq_s32(input, vld1q_s32(output_stage.bias_vector.data(row)));
	  }

	  const OutputStage& output_stage;
	};

	// NEON (int32x4_t) implementation of OutputStageClamp: limits every lane to
	// the range given by the stage's `min` and `max`.
	template <>
	struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
	  using InputType = NEONFragmentInt32x4x1;
	  using OutputType = NEONFragmentInt32x4x1;
	  using OutputStage = OutputStageClamp;

	  // Stores a reference to the stage; `s` must outlive this evaluator.
	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // Clamps the 4 lanes of `input`; the position arguments are unused.
	  OutputType Eval(InputType input, int, int) const {
	    const int32x4_t lower = vdupq_n_s32(output_stage.min);
	    const int32x4_t upper = vdupq_n_s32(output_stage.max);
	    // Raise to the lower bound first, then cap at the upper bound.
	    const int32x4_t raised = vmaxq_s32(input, lower);
	    return vminq_s32(raised, upper);
	  }

	  const OutputStage& output_stage;
	};

	// NEON (int32x4_t) implementation of OutputStageTanh. Everything is
	// inherited from OutputStageTanhEvalImpl instantiated for
	// NEONFragmentInt32x4x1; this specialization only forwards the stage to it.
	template <>
	struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
	    : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
	  OutputStageEvalImpl(const OutputStageTanh& output_stage)
	      : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1>(output_stage) {}
	};

	// StoreFinalOutput for NEONFragmentUint8x4x1: writes lanes 0..3 of `value`
	// to dst at rows row..row+3 of column `col`, one byte at a time.
	// This is quite inefficient, but there is no choice: instructions storing
	// 32 bits at once also assume 32-bit alignment. In practice the slowness is
	// not a problem because the x16 path is used for most values.
	template <typename DstType>
	inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst, int row,
	                             int col) {
	  const uint8x8_t lanes = value;
	  // The lane index must be a compile-time constant, hence no loop.
	  vst1_lane_u8(dst->data(row + 0, col), lanes, 0);
	  vst1_lane_u8(dst->data(row + 1, col), lanes, 1);
	  vst1_lane_u8(dst->data(row + 2, col), lanes, 2);
	  vst1_lane_u8(dst->data(row + 3, col), lanes, 3);
	}

	// StoreFinalOutput for NEONFragmentUint8x16x1: writes the 16 bytes of
	// `value` with a single vector store at the address dst gives for
	// (row, col).
	template <typename DstType>
	inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
	                             int row, int col) {
	  auto* const destination = dst->data(row, col);
	  vst1q_u8(destination, value);
	}

	// StoreFinalOutput for NEONFragmentInt32x4x1 with an int32 destination:
	// writes the 4 lanes of `value` with a single vector store at the address
	// dst gives for (row, col).
	template <typename DstType>
	inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
	                             int col) {
	  auto* const destination = dst->data(row, col);
	  vst1q_s32(destination, value);
	}

	// StoreFinalOutput for NEONFragmentInt32x16x1 with an int32 destination:
	// writes the 4 int32x4_t components of `value` one after the other,
	// component k going to row + 4 * k of column `col`.
	template <typename DstType>
	inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
	                             int row, int col) {
	  int component_row = row;
	  for (int k = 0; k != 4; ++k, component_row += 4) {
	    vst1q_s32(dst->data(component_row, col), value.data.val[k]);
	  }
	}

	} // namespace gemmlowp

	#endif // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_