// caffe2/operators/quantized/int8_simd.h - platform/external/pytorch - Git at Google

 #pragma once

 // We want to allow 128-bit wide SIMD if either NEON is available (as
 // detected by GEMMLOWP_NEON), or both SSE4.2 and Clang are available
 // (in which case we use the NEON_2_SSE.h library to share source
 // between the two implementations). We require SSE4.2 to ensure we
 // can use the full neon2sse library, and we require Clang because
 // GCC has issues correctly compiling some parts of the neon2sse
 // library.

 // Otherwise, the INT8_NEON_SIMD macro will be undefined.

 #include "gemmlowp/fixedpoint/fixedpoint.h"
 #include "gemmlowp/public/gemmlowp.h"

 // Native NEON path: define INT8_NEON_SIMD whenever GEMMLOWP_NEON is defined.
 #if defined(GEMMLOWP_NEON)
 #define INT8_NEON_SIMD
 #endif

 #if defined(__SSE4_2__) && defined(__clang__)
 #define INT8_NEON_SIMD

 #include "NEON_2_SSE.h"
 // Add GEMMLOWP SIMD type wrappers for the NEON2SSE SIMD types.

 namespace gemmlowp {
 // Raw-type traits describing int32x4_t to gemmlowp's fixed-point code:
 // a vector of four std::int32_t lanes.
 template <>
 struct FixedPointRawTypeTraits<int32x4_t> {
   // Number of scalar lanes held by one int32x4_t.
   static const int kLanes = 4;
   // Scalar type stored in each lane.
   using ScalarRawType = std::int32_t;
 };

 // Builds an int32x4_t whose lanes all hold the scalar `x`, using the
 // vdupq_n_s32 intrinsic.
 template <>
 inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
   const int32x4_t broadcast = vdupq_n_s32(x);
   return broadcast;
 }

 // Bitwise AND of `a` and `b`, computed with the vandq_s32 intrinsic.
 template <>
 inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
   const int32x4_t conjunction = vandq_s32(a, b);
   return conjunction;
 }

 // Lane-wise sum of `a` and `b`, computed with the vaddq_s32 intrinsic.
 template <>
 inline int32x4_t Add(int32x4_t a, int32x4_t b) {
   const int32x4_t sum = vaddq_s32(a, b);
   return sum;
 }

 // Shifts every lane of `a` right by `offset` bits. Implemented as
 // vshlq_s32 with the negated shift count broadcast to every lane (a
 // negative count makes vshlq_s32 shift right).
 template <>
 inline int32x4_t ShiftRight(int32x4_t a, int offset) {
   const int32x4_t negated_counts = vdupq_n_s32(-offset);
   return vshlq_s32(a, negated_counts);
 }

 // Per-lane mask for `a < b`: the unsigned result of vcltq_s32 is
 // reinterpreted as a signed vector so it can be used as an int32x4_t.
 template <>
 inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
   const uint32x4_t less_than = vcltq_s32(a, b);
   return vreinterpretq_s32_u32(less_than);
 }

 // Per-lane mask for `a > b`: the unsigned result of vcgtq_s32 is
 // reinterpreted as a signed vector so it can be used as an int32x4_t.
 template <>
 inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
   const uint32x4_t greater_than = vcgtq_s32(a, b);
   return vreinterpretq_s32_u32(greater_than);
 }

 // Bitwise complement of `a`, obtained by XOR-ing (veorq_s32) with a
 // vector whose lanes are all -1, i.e. have every bit set.
 template <>
 inline int32x4_t BitNot(int32x4_t a) {
   const int32x4_t all_bits_set = vdupq_n_s32(-1);
   return veorq_s32(a, all_bits_set);
 }

 // Shifts every lane of `a` left by `offset` bits, using vshlq_s32 with
 // the shift count broadcast to every lane.
 template <>
 inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
   const int32x4_t counts = vdupq_n_s32(offset);
   return vshlq_s32(a, counts);
 }

 // Returns a per-lane mask: all bits set in lanes where `a` is zero, all
 // bits clear elsewhere.
 //
 // The comparison is done directly with vceqq_s32 (mirroring how
 // MaskIfLessThan / MaskIfGreaterThan use vcltq_s32 / vcgtq_s32) rather
 // than through MaskIfEqual. This header provides no int32x4_t
 // specialization of MaskIfEqual, so that call would resolve to whatever
 // generic definition gemmlowp supplies, which is not guaranteed to
 // compare on 32-bit lane boundaries.
 template <>
 inline int32x4_t MaskIfZero(int32x4_t a) {
   const uint32x4_t is_zero = vceqq_s32(a, vdupq_n_s32(0));
   return vreinterpretq_s32_u32(is_zero);
 }

 // Per-lane mask for `a != 0`: vtstq_s32(a, a) tests each lane of `a`
 // against itself, and the unsigned result is reinterpreted as signed.
 template <>
 inline int32x4_t MaskIfNonZero(int32x4_t a) {
   const uint32x4_t any_bit_set = vtstq_s32(a, a);
   return vreinterpretq_s32_u32(any_bit_set);
 }

 // Saturating, rounding, doubling multiply of `a` by `b` that keeps the
 // high half of each product; forwards to the vqrdmulhq_s32 intrinsic.
 template <>
 inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
   const int32x4_t high_product = vqrdmulhq_s32(a, b);
   return high_product;
 }

 // Rounding half-sum of `a` and `b` per lane; forwards to the
 // vrhaddq_s32 (rounding halving add) intrinsic.
 template <>
 inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
   const int32x4_t half_sum = vrhaddq_s32(a, b);
   return half_sum;
 }

 } // namespace gemmlowp
 #endif
	#pragma once

	// We want to allow 128-bit wide SIMD if either NEON is available (as
	// detected by GEMMLOWP_NEON), or both SSE4.2 and Clang are available
	// (in which case we use the NEON_2_SSE.h library to share source
	// between the two implementations). We require SSE4.2 to ensure we
	// can use the full neon2sse library, and we require Clang because
	// GCC has issues correctly compiling some parts of the neon2sse
	// library.

	// Otherwise, the INT8_NEON_SIMD macro will be undefined.

	#include "gemmlowp/fixedpoint/fixedpoint.h"
	#include "gemmlowp/public/gemmlowp.h"

	// Native NEON path: define INT8_NEON_SIMD whenever GEMMLOWP_NEON is defined.
	#if defined(GEMMLOWP_NEON)
	#define INT8_NEON_SIMD
	#endif

	#if defined(__SSE4_2__) && defined(__clang__)
	#define INT8_NEON_SIMD

	#include "NEON_2_SSE.h"
	// Add GEMMLOWP SIMD type wrappers for the NEON2SSE SIMD types.

	namespace gemmlowp {
	// Raw-type traits describing int32x4_t to gemmlowp's fixed-point code:
	// a vector of four std::int32_t lanes.
	template <>
	struct FixedPointRawTypeTraits<int32x4_t> {
	  // Number of scalar lanes held by one int32x4_t.
	  static const int kLanes = 4;
	  // Scalar type stored in each lane.
	  using ScalarRawType = std::int32_t;
	};

	// Builds an int32x4_t whose lanes all hold the scalar `x`, using the
	// vdupq_n_s32 intrinsic.
	template <>
	inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
	  const int32x4_t broadcast = vdupq_n_s32(x);
	  return broadcast;
	}

	// Bitwise AND of `a` and `b`, computed with the vandq_s32 intrinsic.
	template <>
	inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
	  const int32x4_t conjunction = vandq_s32(a, b);
	  return conjunction;
	}

	// Lane-wise sum of `a` and `b`, computed with the vaddq_s32 intrinsic.
	template <>
	inline int32x4_t Add(int32x4_t a, int32x4_t b) {
	  const int32x4_t sum = vaddq_s32(a, b);
	  return sum;
	}

	// Shifts every lane of `a` right by `offset` bits. Implemented as
	// vshlq_s32 with the negated shift count broadcast to every lane (a
	// negative count makes vshlq_s32 shift right).
	template <>
	inline int32x4_t ShiftRight(int32x4_t a, int offset) {
	  const int32x4_t negated_counts = vdupq_n_s32(-offset);
	  return vshlq_s32(a, negated_counts);
	}

	// Per-lane mask for `a < b`: the unsigned result of vcltq_s32 is
	// reinterpreted as a signed vector so it can be used as an int32x4_t.
	template <>
	inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
	  const uint32x4_t less_than = vcltq_s32(a, b);
	  return vreinterpretq_s32_u32(less_than);
	}

	// Per-lane mask for `a > b`: the unsigned result of vcgtq_s32 is
	// reinterpreted as a signed vector so it can be used as an int32x4_t.
	template <>
	inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
	  const uint32x4_t greater_than = vcgtq_s32(a, b);
	  return vreinterpretq_s32_u32(greater_than);
	}

	// Bitwise complement of `a`, obtained by XOR-ing (veorq_s32) with a
	// vector whose lanes are all -1, i.e. have every bit set.
	template <>
	inline int32x4_t BitNot(int32x4_t a) {
	  const int32x4_t all_bits_set = vdupq_n_s32(-1);
	  return veorq_s32(a, all_bits_set);
	}

	// Shifts every lane of `a` left by `offset` bits, using vshlq_s32 with
	// the shift count broadcast to every lane.
	template <>
	inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
	  const int32x4_t counts = vdupq_n_s32(offset);
	  return vshlq_s32(a, counts);
	}

	// Returns a per-lane mask: all bits set in lanes where `a` is zero, all
	// bits clear elsewhere.
	//
	// The comparison is done directly with vceqq_s32 (mirroring how
	// MaskIfLessThan / MaskIfGreaterThan use vcltq_s32 / vcgtq_s32) rather
	// than through MaskIfEqual. This header provides no int32x4_t
	// specialization of MaskIfEqual, so that call would resolve to whatever
	// generic definition gemmlowp supplies, which is not guaranteed to
	// compare on 32-bit lane boundaries.
	template <>
	inline int32x4_t MaskIfZero(int32x4_t a) {
	  const uint32x4_t is_zero = vceqq_s32(a, vdupq_n_s32(0));
	  return vreinterpretq_s32_u32(is_zero);
	}

	// Per-lane mask for `a != 0`: vtstq_s32(a, a) tests each lane of `a`
	// against itself, and the unsigned result is reinterpreted as signed.
	template <>
	inline int32x4_t MaskIfNonZero(int32x4_t a) {
	  const uint32x4_t any_bit_set = vtstq_s32(a, a);
	  return vreinterpretq_s32_u32(any_bit_set);
	}

	// Saturating, rounding, doubling multiply of `a` by `b` that keeps the
	// high half of each product; forwards to the vqrdmulhq_s32 intrinsic.
	template <>
	inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
	  const int32x4_t high_product = vqrdmulhq_s32(a, b);
	  return high_product;
	}

	// Rounding half-sum of `a` and `b` per lane; forwards to the
	// vrhaddq_s32 (rounding halving add) intrinsic.
	template <>
	inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
	  const int32x4_t half_sum = vrhaddq_s32(a, b);
	  return half_sum;
	}

	} // namespace gemmlowp
	#endif