//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "RefLstmFloat32Workload.hpp"
#include "RefWorkloadUtils.hpp"
#include "Activation.hpp"
namespace
{
// Helper functions ported from the Android code base
// Refer to: android/external/tensorflow/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
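// MatrixBatchVectorMultiplyAccumulate: for each batch b,
// outResult(b) += matrix (mRows x mCols) * vector(b) (mCols elements);
// resultStride is the element stride between consecutive output values.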
void MatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                         uint32_t mRows,
                                         uint32_t mCols,
                                         const float* vector,
                                         uint32_t nBatch,
                                         float* outResult,
                                         int resultStride = 1)
{
    float* resultInBatch = outResult;
    for (uint32_t b = 0; b < nBatch; b++)
    {
        const float* matrixPtr = matrix;
        for (uint32_t r = 0; r < mRows; r++)
        {
            const float* vectorInBatch = vector + b * mCols;
            for (uint32_t c = 0; c < mCols; c++)
            {
                *resultInBatch += *matrixPtr++ * *vectorInBatch++;
            }
            resultInBatch += resultStride;
        }
    }
}
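
// Copies vector (vSize elements) into each of the nBatch rows of outBatchVector.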
void VectorBatchVectorAssign(const float* vector,
                             uint32_t vSize,
                             uint32_t nBatch,
                             float* outBatchVector)
{
    for (uint32_t b = 0; b < nBatch; b++)
    {
        memcpy(outBatchVector + b * vSize, vector, vSize * sizeof(float));
    }
}
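
// For each batch b: outResult(b) += vector * batchVector(b), element-wise.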
void VectorBatchVectorCwiseProductAccumulate(const float* vector,
                                             uint32_t vSize,
                                             const float* batchVector,
                                             uint32_t nBatch,
                                             float* outResult)
{
    for (uint32_t b = 0; b < nBatch; b++)
    {
        for (uint32_t v = 0; v < vSize; v++)
        {
            *outResult++ += vector[v] * *batchVector++;
        }
    }
}
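
// result = 1 - vector, element-wise. Used with CIFG to derive the input gate
// from the forget gate: inputGate = 1 - forgetGate.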
void Sub1Vector(const float* vector,
                uint32_t vSize,
                float* result)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *result++ = 1.0f - *vector++;
    }
}
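
// outResult = vector1 * vector2, element-wise.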
void VectorVectorCwiseProduct(const float* vector1,
                              const float* vector2,
                              uint32_t vSize,
                              float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ = *vector1++ * *vector2++;
    }
}
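
// outResult += vector1 * vector2, element-wise.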
void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2,
                                        uint32_t vSize,
                                        float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ += *vector1++ * *vector2++;
    }
}
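
// Clamps f to the range [-absLimit, absLimit].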
float Clip(float f,
           float absLimit)
{
    float result = (absLimit < f) ? absLimit : f;
    result = (-absLimit > result) ? -absLimit : result;
    return result;
}
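
// outResult = vector clamped element-wise to [-absLimit, absLimit].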
void ClipVector(const float* vector,
                uint32_t vSize,
                float absLimit,
                float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ = Clip(*vector++, absLimit);
    }
}
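
// Plain copy of vSize floats from vector to outResult.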
void CopyVector(const float* vector,
                uint32_t vSize,
                float* outResult)
{
    memcpy(outResult, vector, vSize * sizeof(float));
}
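
// Maps the Android/TfLite fused-activation enum value to the corresponding
// ArmNN activation function and its A/B parameters. For "None" (0) the
// activation function is left unset; callers guard on m_ActivationFunc > 0
// before applying it.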
void SetActivationParameters(uint32_t activation,
                             armnn::ActivationFunction& outArmnnActivation,
                             float& outA,
                             float& outB)
{
    switch (activation)
    {
    case 0: // None
        outA = 0;
        outB = 0;
        return;
    case 1: // Relu
        outArmnnActivation = armnn::ActivationFunction::ReLu;
        outA = 0;
        outB = 0;
        return;
    case 3: // Relu6
        outArmnnActivation = armnn::ActivationFunction::BoundedReLu;
        outA = 6;
        outB = 0;
        return;
    case 4: // Tanh
        outArmnnActivation = armnn::ActivationFunction::TanH;
        outA = 1;
        outB = 1;
        return;
    case 6: // Sigmoid
        outArmnnActivation = armnn::ActivationFunction::Sigmoid;
        outA = 0;
        outB = 0;
        return;
    default:
        throw armnn::Exception("Unsupported activation function: " + std::to_string(activation));
    }
}
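
// Takes a copy of an optional tensor so the workload owns its weights;
// returns nullptr when the optional tensor is not present.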
std::unique_ptr<armnn::ScopedCpuTensorHandle> AssignScopedCpuTensorHandle(const armnn::ConstCpuTensorHandle* ptr)
{
    if (!ptr)
    {
        return nullptr;
    }
    return std::make_unique<armnn::ScopedCpuTensorHandle>(*ptr);
}
} // anonymous namespace
namespace armnn
{
RefLstmFloat32Workload::RefLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info)
    : Float32Workload<LstmQueueDescriptor>(descriptor, info)
    , m_InputToInputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_InputToInputWeights))
    , m_InputToForgetWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToForgetWeights))
    , m_InputToCellWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_InputToCellWeights))
    , m_InputToOutputWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToOutputWeights))
    , m_RecurrentToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToInputWeights))
    , m_RecurrentToForgetWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToForgetWeights))
    , m_RecurrentToCellWeightsTensor  (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToCellWeights))
    , m_RecurrentToOutputWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToOutputWeights))
    , m_CellToInputWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_CellToInputWeights))
    , m_CellToForgetWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToForgetWeights))
    , m_CellToOutputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToOutputWeights))
    , m_InputGateBiasTensor           (AssignScopedCpuTensorHandle(descriptor.m_InputGateBias))
    , m_ForgetGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ForgetGateBias))
    , m_CellBiasTensor                (AssignScopedCpuTensorHandle(descriptor.m_CellBias))
    , m_OutputGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_OutputGateBias))
    , m_ProjectionWeightsTensor       (AssignScopedCpuTensorHandle(descriptor.m_ProjectionWeights))
    , m_ProjectionBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ProjectionBias))
{}

void RefLstmFloat32Workload::Execute() const
{
    // This is a port of the LSTM::Eval() method in the Android code base.
    // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp
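
    // Summary of the step computed below (products with lowercase w_* vectors
    // are element-wise; g is the configured cell activation; the peephole
    // terms w_c* apply only when peephole connections are enabled):
    //   i = sigmoid(W_xi*x + W_hi*h_prev + w_ci*c_prev + b_i)  (i = 1 - f with CIFG)
    //   f = sigmoid(W_xf*x + W_hf*h_prev + w_cf*c_prev + b_f)
    //   c = f*c_prev + i*g(W_xc*x + W_hc*h_prev + b_c)
    //   o = sigmoid(W_xo*x + W_ho*h_prev + w_co*c + b_o)
    //   h = o*g(c), optionally projected and clipped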
    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
    const TensorShape& inputShape = inputInfo.GetShape();

    float* scratchBuffer  = GetOutputTensorDataFloat(0, m_Data);
    float* outputStateOut = GetOutputTensorDataFloat(1, m_Data);
    float* cellStateOut   = GetOutputTensorDataFloat(2, m_Data);
    float* output         = GetOutputTensorDataFloat(3, m_Data);

    const float* inputData     = GetInputTensorDataFloat(0, m_Data);
    const float* outputStateIn = GetInputTensorDataFloat(1, m_Data);
    const float* cellStateIn   = GetInputTensorDataFloat(2, m_Data);

    const uint32_t nBatch  = inputShape[0];
    const uint32_t nInput  = inputShape[1];
    const uint32_t nCell   = m_InputToOutputWeightsTensor->GetShape()[0];
    const uint32_t nOutput = m_RecurrentToOutputWeightsTensor->GetShape()[1];

    const bool useCifg     = m_Data.m_Parameters.m_CifgEnabled;
    const bool usePeephole = m_Data.m_Parameters.m_PeepholeEnabled;
    // Index the scratch buffer pointers into the global scratch buffer.
    float* inputGateScratch  = nullptr;
    float* cellScratch       = nullptr;
    float* forgetGateScratch = nullptr;
    float* outputGateScratch = nullptr;

    if (useCifg)
    {
        cellScratch       = scratchBuffer + 0 * nCell * nBatch;
        forgetGateScratch = scratchBuffer + 1 * nCell * nBatch;
        outputGateScratch = scratchBuffer + 2 * nCell * nBatch;
    }
    else
    {
        inputGateScratch  = scratchBuffer + 0 * nCell * nBatch;
        cellScratch       = scratchBuffer + 1 * nCell * nBatch;
        forgetGateScratch = scratchBuffer + 2 * nCell * nBatch;
        outputGateScratch = scratchBuffer + 3 * nCell * nBatch;
    }
    // Initialize scratch buffers with bias.
    if (!useCifg)
    {
        VectorBatchVectorAssign(m_InputGateBiasTensor->GetTensor<float>(),
                                nCell, nBatch, inputGateScratch);
    }
    VectorBatchVectorAssign(m_ForgetGateBiasTensor->GetTensor<float>(),
                            nCell, nBatch, forgetGateScratch);
    VectorBatchVectorAssign(m_CellBiasTensor->GetTensor<float>(),
                            nCell, nBatch, cellScratch);
    VectorBatchVectorAssign(m_OutputGateBiasTensor->GetTensor<float>(),
                            nCell, nBatch, outputGateScratch);
    // For each batch and cell: compute input_weight * input.
    if (!useCifg)
    {
        MatrixBatchVectorMultiplyAccumulate(m_InputToInputWeightsTensor->GetTensor<float>(),
                                            nCell, nInput, inputData, nBatch, inputGateScratch);
    }
    MatrixBatchVectorMultiplyAccumulate(m_InputToForgetWeightsTensor->GetTensor<float>(),
                                        nCell, nInput, inputData, nBatch, forgetGateScratch);
    MatrixBatchVectorMultiplyAccumulate(m_InputToCellWeightsTensor->GetTensor<float>(),
                                        nCell, nInput, inputData, nBatch, cellScratch);
    MatrixBatchVectorMultiplyAccumulate(m_InputToOutputWeightsTensor->GetTensor<float>(),
                                        nCell, nInput, inputData, nBatch, outputGateScratch);

    // For each batch and cell: compute recurrent_weight * output_state.
    if (!useCifg)
    {
        MatrixBatchVectorMultiplyAccumulate(m_RecurrentToInputWeightsTensor->GetTensor<float>(),
                                            nCell, nOutput, outputStateIn, nBatch, inputGateScratch);
    }
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToForgetWeightsTensor->GetTensor<float>(),
                                        nCell, nOutput, outputStateIn, nBatch, forgetGateScratch);
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToCellWeightsTensor->GetTensor<float>(),
                                        nCell, nOutput, outputStateIn, nBatch, cellScratch);
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToOutputWeightsTensor->GetTensor<float>(),
                                        nCell, nOutput, outputStateIn, nBatch, outputGateScratch);
    // For each batch and cell: update input gate.
    if (!useCifg)
    {
        if (usePeephole)
        {
            VectorBatchVectorCwiseProductAccumulate(m_CellToInputWeightsTensor->GetTensor<float>(),
                                                    nCell, cellStateIn, nBatch, inputGateScratch);
        }
        Activation(inputGateScratch, inputGateScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   ActivationFunction::Sigmoid, 0, 0);
    }
    // For each batch and cell: update forget gate.
    if (usePeephole)
    {
        VectorBatchVectorCwiseProductAccumulate(m_CellToForgetWeightsTensor->GetTensor<float>(), nCell,
                                                cellStateIn, nBatch, forgetGateScratch);
    }
    Activation(forgetGateScratch, forgetGateScratch,
               TensorInfo({nCell, nBatch}, DataType::Float32),
               ActivationFunction::Sigmoid, 0, 0);
    // For each batch and cell: update the cell.
    VectorVectorCwiseProduct(forgetGateScratch, cellStateIn, nBatch * nCell, cellStateOut);

    ActivationFunction armnnActivationFunc = ActivationFunction::Sigmoid;
    float a = 0;
    float b = 0;
    SetActivationParameters(m_Data.m_Parameters.m_ActivationFunc, armnnActivationFunc, a, b);

    if (m_Data.m_Parameters.m_ActivationFunc > 0)
    {
        Activation(cellScratch, cellScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   armnnActivationFunc, a, b);
    }
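
    // Accumulate the input contribution into the new cell state. With CIFG the
    // input gate is coupled to the forget gate as inputGate = 1 - forgetGate.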
    if (useCifg)
    {
        Sub1Vector(forgetGateScratch, nBatch * nCell, forgetGateScratch);
        VectorVectorCwiseProductAccumulate(cellScratch, forgetGateScratch, nBatch * nCell, cellStateOut);
    }
    else
    {
        VectorVectorCwiseProductAccumulate(cellScratch, inputGateScratch, nBatch * nCell, cellStateOut);
    }
    if (m_Data.m_Parameters.m_ClippingThresCell > 0.0)
    {
        ClipVector(cellStateOut, nBatch * nCell, m_Data.m_Parameters.m_ClippingThresCell, cellStateOut);
    }
    // For each batch and cell: update the output gate.
    if (usePeephole)
    {
        VectorBatchVectorCwiseProductAccumulate(m_CellToOutputWeightsTensor->GetTensor<float>(),
                                                nCell, cellStateOut, nBatch, outputGateScratch);
    }
    Activation(outputGateScratch, outputGateScratch,
               TensorInfo({nCell, nBatch}, DataType::Float32),
               ActivationFunction::Sigmoid, 0, 0);
    if (m_Data.m_Parameters.m_ActivationFunc > 0)
    {
        Activation(cellStateOut, cellScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   armnnActivationFunc, a, b);
    }
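
    // Combine the output gate with the activated cell state held in cellScratch.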
    VectorVectorCwiseProduct(outputGateScratch, cellScratch, nBatch * nCell, outputGateScratch);
    // For each batch: update the projection and output_state.
    if (m_Data.m_Parameters.m_ProjectionEnabled)
    {
        if (m_ProjectionBiasTensor)
        {
            VectorBatchVectorAssign(m_ProjectionBiasTensor->GetTensor<float>(),
                                    nOutput, nBatch, output);
        }
        MatrixBatchVectorMultiplyAccumulate(m_ProjectionWeightsTensor->GetTensor<float>(),
                                            nOutput, nCell, outputGateScratch, nBatch, output);

        if (m_Data.m_Parameters.m_ClippingThresProj > 0.0)
        {
            ClipVector(output, nBatch * nOutput, m_Data.m_Parameters.m_ClippingThresProj, output);
        }
    }
    else
    {
        CopyVector(outputGateScratch, nBatch * nOutput, output);
    }
    CopyVector(output, nBatch * nOutput, outputStateOut);
}
} // namespace armnn