| // Copyright 2015 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // output.h: processing the 32-bit accumulators output by the unpack |
| // stage, obtaining the final result matrix entries and storing them into |
| // the destination matrix. |
| |
| #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ |
| #define GEMMLOWP_INTERNAL_OUTPUT_H_ |
| |
| #include <cmath> |
| #include <tuple> |
| #include <type_traits> |
| |
| #include "../public/output_stages.h" |
| #include "fixedpoint.h" |
| |
| namespace gemmlowp { |
| |
| // A Fragment is a small fixed-size matrix typically stored in one or |
| // a few architecture-specific SIMD vectors. Besides plain old scalar types |
| // such as int32_t, Fragment types are what can be used as input/output data |
| // types for output pipeline stages. |
| // |
| // More details: |
| // |
| // In the generic scalar code in this file, we have only implemented |
| // evaluation of output stages for scalar inputs (e.g. plain int32_t values). |
| // Other files (e.g. output_neon.h) are to provide SIMD paths by implementing |
| // evaluation of output stages for SIMD vector types. However, this raises |
| // the question of how the different values ("lanes") in a SIMD vector |
| // correspond to different values in the whole matrices. For simple entry-wise |
| // output stages, this doesn't matter, but for other output stages depending |
| // on position within the whole matrix, this does matter. To solve this |
| // problem, rather than implementing evaluation of output stages for raw |
| // SIMD vector types, we wrap SIMD vector types in "fragment" structs that |
| // bring the additional structure of "shape" i.e. mapping SIMD lanes to |
| // matrix entries, and we specialize evaluation of output stage for such |
| // fragment types. The Fragment template struct here is how we generate |
| // all fragment structs. For example, in output_neon.h, it may be specialized |
| // with DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for |
| // vector shapes. While Fragment is only used for SIMD paths, we leave it |
| // here in this platform-generic file because this same template should |
| // cover the needs of any SIMD architectures. |
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
  typedef tDataType DataType;
  static const int kRows = tRows;
  static const int kCols = tCols;
  static const MapOrder kOrder = tOrder;

  // Default constructor leaves 'data' uninitialized, like the underlying
  // scalar/SIMD type itself.
  Fragment() {}
  // Implicit wrap/unwrap conversions let output-stage code use a Fragment
  // interchangeably with its underlying DataType.
  Fragment(const DataType& d) : data(d) {}
  operator DataType() const { return data; }

  DataType data;
};

// The scalar fragment types used by the generic (non-SIMD) code paths in
// this file: a single int32 accumulator and a single uint8 result entry.
typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
| |
// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides a OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
//   1. Different output stages may expect different data types. The
//      only hard constraint is that the first stage accepts int32, as
//      the unpack stage produces int32 accumulators.
//   2. For a given scalar data type such as int32, there is still the
//      possibility of having SIMD vector types such as NEON int32x4_t,
//      typically wrapped as "fragment" types, see struct Fragment.
//      Thus, there can be several OutputStageEvalImpl
//      specializations for a single OutputStageType, for different
//      InputType's.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
  // This generic template body should never be hit.
  // The condition is phrased in terms of InputType (and is false for every
  // InputType this could actually be instantiated with) so that the
  // assertion only fires upon instantiation, not at template definition.
  static_assert(
      std::is_same<InputType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");

  OutputStageEvalImpl(const OutputStageType&) {}
};
| |
| // Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data |
| template <> |
| struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale, |
| FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentInt32x1x1 OutputType; |
| typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int, int) const { |
| const std::int32_t result_shift = output_stage.result_shift; |
| const std::int32_t result_mult_int = output_stage.result_mult_int; |
| const std::int32_t result_offset = output_stage.result_offset; |
| const std::int32_t kRoundingTerm = |
| (result_shift < 1) ? 0 : (1 << (result_shift - 1)); |
| return ((input + result_offset) * result_mult_int + kRoundingTerm) >> |
| result_shift; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <> |
| struct OutputStageEvalImpl< |
| OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>, |
| FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentInt32x1x1 OutputType; |
| typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col> |
| OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| const std::int32_t result_shift = output_stage.result_shift; |
| const std::int32_t result_mult_int = output_stage.result_mult_int(row); |
| const std::int32_t result_offset = output_stage.result_offset(row); |
| const std::int32_t kRoundingTerm = |
| (result_shift < 1) ? 0 : (1 << (result_shift - 1)); |
| return ((input + result_offset) * result_mult_int + kRoundingTerm) >> |
| result_shift; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <> |
| struct OutputStageEvalImpl< |
| OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>, |
| FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentInt32x1x1 OutputType; |
| typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row> |
| OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| const std::int32_t result_shift = output_stage.result_shift; |
| const std::int32_t result_mult_int = output_stage.result_mult_int(col); |
| const std::int32_t result_offset = output_stage.result_offset(col); |
| const std::int32_t kRoundingTerm = |
| (result_shift < 1) ? 0 : (1 << (result_shift - 1)); |
| return ((input + result_offset) * result_mult_int + kRoundingTerm) >> |
| result_shift; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| // Implementation of OutputStageSaturatingCastToUint8 for scalar data |
| template <> |
| struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8, |
| FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentUint8x1x1 OutputType; |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input, int, int) const { |
| std::int32_t data = input.data; |
| return data > 255 ? 255 : data < 0 ? 0 : data; |
| } |
| }; |
| |
| // Implementation of OutputStageBiasAddition for scalar data |
| template <typename VectorType> |
| struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, |
| FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentInt32x1x1 OutputType; |
| typedef OutputStageBiasAddition<VectorType> OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| if (VectorType::kShape == VectorShape::Row) { |
| return input + output_stage.bias_vector(col); |
| } else { |
| return input + output_stage.bias_vector(row); |
| } |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| // Implementation of OutputStageClamp for scalar data |
| template <> |
| struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> { |
| typedef FragmentInt32x1x1 InputType; |
| typedef FragmentInt32x1x1 OutputType; |
| typedef OutputStageClamp OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int, int) const { |
| const std::int32_t min = output_stage.min; |
| const std::int32_t max = output_stage.max; |
| return std::min(std::max(input.data, min), max); |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
  typedef tInputType InputType;
  typedef InputType OutputType;
  typedef typename InputType::DataType DataType;
  typedef OutputStageTanh OutputStage;

  // Precomputes the fixed-point constants used by Eval from the stage's
  // real_zero_as_int32 / real_amplitude_as_int32 parameters.
  // NOTE(review): assumes real_amplitude_as_int32 > 0 -- a zero value would
  // divide by zero below; confirm callers guarantee this.
  OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    // Outside of +/- 8 amplitudes around zero, skip the fixed-point tanh
    // evaluation and clamp directly to the output bounds (see Eval).
    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    // Normalize 1/amplitude into [0.5, 1), counting the doublings; the
    // count is re-applied in Eval as a left shift on the raw value.
    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized =
        ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);

    // Likewise, normalize the amplitude itself into [0.5, 1), counting the
    // halvings; re-applied in Eval as part of the final right shift.
    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        ToFixedPoint<DataType, 0>(amplitude_normalized_double);
  }

  // Maps the int32 input through a fixed-point approximation of
  //   real_zero + amplitude * tanh((input - real_zero) / amplitude),
  // clamping to [output_min, output_max] outside the cutoff range.
  OutputType Eval(InputType input, int, int) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    // F3: fixed point with 3 integer bits, matching the +/- 8 cutoff range
    // of the normalized input. F0: 0 integer bits, for tanh's (-1, 1) range.
    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    // fixed-point affine transformation
    DataType input_centered =
        Sub(input.data, Dup<DataType>(real_zero_as_int32));
    F3 fixedpoint_input =
        F3::FromRaw(input_centered) * inverse_amplitude_normalized;
    // left shift: undoes the [0.5, 1) normalization of 1/amplitude.
    fixedpoint_input.raw() =
        ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
    // fixed-point tanh and multiplication
    F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
    // right shift: rescales by the amplitude's exponent and re-centers
    // around real_zero.
    DataType int32_output =
        Add(Dup<DataType>(real_zero_as_int32),
            ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

    // Saturation: inputs at or beyond the cutoffs map straight to the
    // extremal outputs, bypassing the computed value.
    DataType mask_if_below_cutoff_min =
        MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
    DataType mask_if_above_cutoff_max =
        MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));

    return SelectUsingMask(
        mask_if_below_cutoff_min, Dup<DataType>(output_min),
        SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                        int32_output));
  }

  const OutputStage& output_stage;
  // Inputs at or beyond these cutoffs map directly to output_min/output_max.
  std::int32_t input_cutoff_min, input_cutoff_max;
  // Output saturation bounds: real_zero -/+ amplitude.
  std::int32_t output_min, output_max;
  // 1/amplitude normalized into [0.5, 1) as 0-integer-bit fixed point,
  // with the number of doublings taken out during normalization.
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  int inverse_amplitude_neg_exponent;
  // amplitude normalized into [0.5, 1) as 0-integer-bit fixed point,
  // with the number of halvings taken out during normalization.
  FixedPoint<DataType, 0> amplitude_normalized;
  int amplitude_exponent;
};
| |
// Scalar specialization of the tanh output stage: simply reuses the
// generic scalar/SIMD implementation above with the int32 scalar fragment.
template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
    : OutputStageTanhEvalImpl<FragmentInt32x1x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};
| |
// OutputPipelineOutputType is a helper to determine the output data type of
// a pipeline, for a given input data type. It is a recursive template; see
// the explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  // The data type produced by stage 'FirstStage' when fed 'InputType'.
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  // Recurse over the remaining stages, threading each stage's output type
  // into the next stage's input.
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                            FirstStageOutputType>::Type Type;
};
| |
// Recursion termination: once FirstStage reaches the pipeline size, the
// pipeline's overall output type is the current InputType.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  typedef InputType Type;
};
| |
// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  // The type produced after evaluating all stages from 'FirstStage' on.
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  // Evaluates stages [FirstStage, end) on 'input' at position (row, col).
  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  // Evaluator for stage 'FirstStage' itself...
  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  // ...and for the remainder of the pipeline after it.
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};
| |
// Specialization on 'StopRecursion' for terminating the recursion: with no
// stages left to evaluate, evaluation is the identity.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const {
    // Terminating the recursion.
    return input;
  }
};
| |
// StoreFinalOutput takes the final value at the end of the output pipeline
// and stores it into the destination matrix. It can be specialized for
// different data types; this generic implementation is typically only used
// for plain old scalar (non-SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
  // DstType::data(row, col) returns a pointer to the destination entry.
  auto* dst_entry = dst->data(row, col);
  *dst_entry = value;
}
| |
// OutputPipelineExecutor wraps an output pipeline together with its
// compile-time-unrolled evaluator (OutputPipelineEvalImpl).
template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int row, int col) {
    // Statically assert that the output pipeline matches the given destination
    // matrix's scalar type.
    // NOTE(review): the pipeline's output type is computed from the scalar
    // FragmentInt32x1x1 rather than from InputType; presumably the resulting
    // scalar type is the same for SIMD fragments -- confirm for SIMD paths.
    typedef typename OutputPipelineOutputType<OutputPipelineType, 0,
                                              FragmentInt32x1x1>::Type::DataType
        ScalarOutputType;
    typedef typename DstType::Scalar ScalarDstType;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Evaluate the output pipeline.
    auto output = output_pipeline_eval_impl_.Eval(input, row, col);
    // Store the result into the destination matrix.
    StoreFinalOutput(output, dst, row, col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};
| |
| } // namespace gemmlowp |
| |
| #endif // GEMMLOWP_INTERNAL_OUTPUT_H_ |