IVGCVSW-3696 Add NEON ArgMinMax workload and tests

 * Added layer tests and fixed validation in WorkloadData.
 * Also enabled copying to/from NEON tensor handles for Signed32,
   which ArgMinMax needs for its Int32 output.
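
A minimal sketch of exercising the new CpuAcc path through the public
API (assuming INetwork::AddArgMinMaxLayer and the usual runtime setup;
the shapes mirror the channel test added below):

    armnn::ArgMinMaxDescriptor descriptor;
    descriptor.m_Function = armnn::ArgMinMaxFunction::Max;
    descriptor.m_Axis     = 1; // channel axis of an NCHW input

    armnn::INetworkPtr network = armnn::INetwork::Create();
    armnn::IConnectableLayer* input  = network->AddInputLayer(0);
    armnn::IConnectableLayer* argMax = network->AddArgMinMaxLayer(descriptor, "argmax");
    armnn::IConnectableLayer* output = network->AddOutputLayer(0);

    input->GetOutputSlot(0).Connect(argMax->GetInputSlot(0));
    argMax->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    input->GetOutputSlot(0).SetTensorInfo(
        armnn::TensorInfo({ 1, 3, 2, 4 }, armnn::DataType::Float32));
    // ArgMinMax emits indices, so its output must be Signed32.
    argMax->GetOutputSlot(0).SetTensorInfo(
        armnn::TensorInfo({ 1, 2, 4 }, armnn::DataType::Signed32));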

Signed-off-by: James Conroy <james.conroy@arm.com>
Change-Id: I5e961f88434e18d5a8ebff956d20a1c2cf1b50bb
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 3fbdec7..e49fd09 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -450,10 +450,10 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float16,
-            DataType::Float32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -476,15 +476,15 @@
         throw InvalidArgumentException(descriptorName + ": Output of ArgMinMax layer must be Int32.");
     }
 
-    std::vector<DataType> supportedTypes =
-            {
-                    DataType::Float16,
-                    DataType::Float32,
-                    DataType::QuantisedAsymm8,
-                    DataType::QuantisedSymm16
-            };
+    std::vector<DataType> supportedInputTypes =
+    {
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
+    };
 
-    ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
+    ValidateDataTypes(inputTensorInfo, supportedInputTypes, descriptorName);
 }
 
 void SoftmaxQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
@@ -499,10 +499,10 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float16,
-            DataType::Float32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -519,12 +519,12 @@
     // Check the supported data types
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16,
-            DataType::Boolean,
-            DataType::Signed32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float32,
+        DataType::Float16,
+        DataType::Boolean,
+        DataType::Signed32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     const TensorInfo& inputTensorInfo = workloadInfo.m_InputTensorInfos[0];
@@ -655,12 +655,12 @@
     // Check the supported data types
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16,
-            DataType::Boolean,
-            DataType::Signed32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float32,
+        DataType::Float16,
+        DataType::Boolean,
+        DataType::Signed32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
@@ -742,12 +742,12 @@
     // Check the supported data types
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16,
-            DataType::Boolean,
-            DataType::Signed32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float32,
+        DataType::Float16,
+        DataType::Boolean,
+        DataType::Signed32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(workloadInfo.m_InputTensorInfos[0], supportedTypes, descriptorName);
@@ -805,10 +805,10 @@
     // Check the supported data types
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -1937,8 +1937,8 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16
+        DataType::Float32,
+        DataType::Float16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -1962,10 +1962,10 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -2129,10 +2129,10 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float16,
-            DataType::Float32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -2157,10 +2157,10 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float16,
-            DataType::Float32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
     };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
@@ -2255,8 +2255,8 @@
 
     std::vector<DataType> supportedTypes =
     {
-            DataType::Float32,
-            DataType::Float16
+        DataType::Float32,
+        DataType::Float16
     };
 
     ValidateDataTypes(outputTensorInfo, supportedTypes, descriptorName);
@@ -2578,12 +2578,12 @@
     ValidateTensorShapesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
 
     std::vector<DataType> supportedTypes =
-        {
-            DataType::Float16,
-            DataType::Float32,
-            DataType::QuantisedAsymm8,
-            DataType::QuantisedSymm16
-        };
+    {
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
+    };
 
     ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
     ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
diff --git a/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.cpp
index 9c07599..e023d60 100644
--- a/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.cpp
@@ -30,7 +30,6 @@
     auto inputTensor = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
 
     LayerTestResult<int32_t, 3> result(outputTensorInfo);
-
     result.outputExpected = MakeTensor<int32_t, 3>(outputTensorInfo, outputData);
 
     std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
@@ -57,7 +56,6 @@
     CopyDataFromITensorHandle(&result.output[0][0][0], outputHandle.get());
 
     return result;
-
 }
 
 } // namespace
@@ -86,7 +84,7 @@
     return ArgMinMaxTestCommon<ArmnnType>(workloadFactory, memoryManager,
                                           armnn::ArgMinMaxFunction::Max,
                                           inputTensorInfo, outputTensorInfo,
-                                          inputValues, outputValues, 3);
+                                          inputValues, outputValues, -1); // axis -1 is the same as axis 3 here
 }
 
 template<armnn::DataType ArmnnType, typename T>
@@ -117,43 +115,7 @@
 }
 
 template<armnn::DataType ArmnnType, typename T>
-LayerTestResult<int32_t, 3> ArgMinChannel4dTest(
-        armnn::IWorkloadFactory& workloadFactory,
-        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
-{
-    const armnn::TensorShape inputShape{ 1, 3, 2, 4};
-    const armnn::TensorShape outputShape{ 1, 2, 4 }; // C=1,2,4 H =1,3,4 W=1,3,2
-
-    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
-
-    if(armnn::IsQuantizedType<T>())
-    {
-        inputTensorInfo.SetQuantizationScale(1.0f);
-        inputTensorInfo.SetQuantizationOffset(0);
-    }
-
-    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Signed32);
-
-    std::vector<float> inputValues({ 1.0f,   2.0f,   3.0f,   4.0f,
-                                     5.0f,   6.0f,   7.0f,   8.0f,
-
-                                     10.0f,  20.0f,  30.0f,  40.0f,
-                                     50.0f,  60.0f,  70.0f,  80.0f,
-
-                                     100.0f, 200.0f, 300.0f, 400.0f,
-                                     500.0f, 600.0f, 700.0f, 800.0f });
-
-    std::vector<int32_t> outputValues({ 0, 0, 0, 0,
-                                        0, 0, 0, 0 });
-
-    return ArgMinMaxTestCommon<ArmnnType>(workloadFactory, memoryManager,
-                                          armnn::ArgMinMaxFunction::Min,
-                                          inputTensorInfo, outputTensorInfo,
-                                          inputValues, outputValues, 1);
-}
-
-template<armnn::DataType ArmnnType, typename T>
-LayerTestResult<int32_t, 3> ArgMaxChannel4dTest(
+LayerTestResult<int32_t, 3> ArgMinChannelTest(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
 {
@@ -170,15 +132,49 @@
 
     armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Signed32);
 
-    std::vector<float> inputValues({ 1.0f,   2.0f,   3.0f,   4.0f,
-                                     5.0f,   6.0f,   7.0f,   8.0f,
+    std::vector<float> inputValues({   1.0f,   2.0f,   3.0f,   4.0f,
+                                       5.0f,   6.0f,   7.0f,   8.0f,
+
+                                      10.0f,  20.0f,  30.0f,  40.0f,
+                                      50.0f,  60.0f,  70.0f,  80.0f,
+
+                                     100.0f, 200.0f, 300.0f, 400.0f,
+                                     500.0f, 600.0f, 700.0f, 800.0f });
+    std::vector<int32_t> outputValues({ 0, 0, 0, 0,
+                                        0, 0, 0, 0 });
+
+    return ArgMinMaxTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                          armnn::ArgMinMaxFunction::Min,
+                                          inputTensorInfo, outputTensorInfo,
+                                          inputValues, outputValues, 1);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<int32_t, 3> ArgMaxChannelTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 1, 3, 2, 4};
+    const armnn::TensorShape outputShape{ 1, 2, 4 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Signed32);
+
+    std::vector<float> inputValues({  1.0f,   2.0f,   3.0f,   4.0f,
+                                      5.0f,   6.0f,   7.0f,   8.0f,
 
                                      10.0f,  20.0f,  30.0f,  40.0f,
                                      50.0f,  60.0f,  70.0f,  80.0f,
 
-                                     100.0f, 200.0f, 300.0f, 400.0f,
-                                     500.0f, 600.0f, 700.0f, 800.0f });
-
+                                    100.0f, 200.0f, 300.0f, 400.0f,
+                                    500.0f, 600.0f, 700.0f, 800.0f });
     std::vector<int32_t> outputValues({ 2, 2, 2, 2,
                                         2, 2, 2, 2 });
 
@@ -188,6 +184,64 @@
                                           inputValues, outputValues, 1);
 }
 
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<int32_t, 3> ArgMaxHeightTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 1, 3, 2, 4};
+    const armnn::TensorShape outputShape{ 3, 1, 4 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Signed32);
+
+    std::vector<float> inputValues({  1.0f,   2.0f,   3.0f,   4.0f,
+                                      5.0f,   6.0f,   7.0f,   8.0f,
+
+                                     10.0f,  20.0f,  30.0f,  40.0f,
+                                     50.0f,  60.0f,  70.0f,  80.0f,
+
+                                    100.0f, 200.0f, 300.0f, 400.0f,
+                                    500.0f, 600.0f, 700.0f, 800.0f });
+    std::vector<int32_t> outputValues({ 1, 1, 1, 1,
+                                        1, 1, 1, 1,
+                                        1, 1, 1, 1 });
+
+    return ArgMinMaxTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                          armnn::ArgMinMaxFunction::Max,
+                                          inputTensorInfo, outputTensorInfo,
+                                          inputValues, outputValues, 2);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<int32_t, 3> ArgMinWidthTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 1, 3, 2, 4};
+    const armnn::TensorShape outputShape{ 3, 2, 1 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Signed32);
+
+    std::vector<float> inputValues({  1.0f,   2.0f,   3.0f,   4.0f,
+                                      5.0f,   6.0f,   7.0f,   8.0f,
+
+                                     10.0f,  20.0f,  30.0f,  40.0f,
+                                     50.0f,  60.0f,  70.0f,  80.0f,
+
+                                    100.0f, 200.0f, 300.0f, 400.0f,
+                                    500.0f, 600.0f, 700.0f, 800.0f });
+    std::vector<int32_t> outputValues({ 0, 0,
+                                        0, 0,
+                                        0, 0 });
+
+    return ArgMinMaxTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                          armnn::ArgMinMaxFunction::Min,
+                                          inputTensorInfo, outputTensorInfo,
+                                          inputValues, outputValues, 3);
+}
 
 // Explicit template specializations
 
@@ -197,56 +251,66 @@
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMinSimpleTest<armnn::DataType::Float32>(
-        armnn::IWorkloadFactory& workloadFactory,
-        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
-
-template LayerTestResult<int32_t, 3>
 ArgMaxSimpleTest<armnn::DataType::QuantisedAsymm8>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMinSimpleTest<armnn::DataType::QuantisedAsymm8>(
-        armnn::IWorkloadFactory& workloadFactory,
-        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
-
-template LayerTestResult<int32_t, 3>
 ArgMaxSimpleTest<armnn::DataType::QuantisedSymm16>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
+ArgMinSimpleTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<int32_t, 3>
+ArgMinSimpleTest<armnn::DataType::QuantisedAsymm8>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<int32_t, 3>
 ArgMinSimpleTest<armnn::DataType::QuantisedSymm16>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMinChannel4dTest<armnn::DataType::Float32>(
+ArgMinChannelTest<armnn::DataType::Float32>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMinChannel4dTest<armnn::DataType::QuantisedAsymm8>(
+ArgMinChannelTest<armnn::DataType::QuantisedAsymm8>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMinChannel4dTest<armnn::DataType::QuantisedSymm16>(
+ArgMinChannelTest<armnn::DataType::QuantisedSymm16>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMaxChannel4dTest<armnn::DataType::Float32>(
+ArgMaxChannelTest<armnn::DataType::Float32>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMaxChannel4dTest<armnn::DataType::QuantisedAsymm8>(
+ArgMaxChannelTest<armnn::DataType::QuantisedAsymm8>(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template LayerTestResult<int32_t, 3>
-ArgMaxChannel4dTest<armnn::DataType::QuantisedSymm16>(
+ArgMaxChannelTest<armnn::DataType::QuantisedSymm16>(
         armnn::IWorkloadFactory& workloadFactory,
-        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
\ No newline at end of file
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<int32_t, 3>
+ArgMaxHeightTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<int32_t, 3>
+ArgMinWidthTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp
index 79d77d4..b3bd7db 100644
--- a/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp
+++ b/src/backends/backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp
@@ -21,9 +21,17 @@
                                              const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
-LayerTestResult<int32_t, 3> ArgMinChannel4dTest(armnn::IWorkloadFactory& workloadFactory,
-                                                const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+LayerTestResult<int32_t, 3> ArgMinChannelTest(armnn::IWorkloadFactory& workloadFactory,
+                                              const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
-LayerTestResult<int32_t, 3> ArgMaxChannel4dTest(armnn::IWorkloadFactory& workloadFactory,
-                                                const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
\ No newline at end of file
+LayerTestResult<int32_t, 3> ArgMaxChannelTest(armnn::IWorkloadFactory& workloadFactory,
+                                              const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<int32_t, 3> ArgMaxHeightTest(armnn::IWorkloadFactory& workloadFactory,
+                                             const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<int32_t, 3> ArgMinWidthTest(armnn::IWorkloadFactory& workloadFactory,
+                                            const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index b713aba..2f3643f 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -21,6 +21,7 @@
 #include "workloads/NeonAbsWorkload.hpp"
 #include "workloads/NeonAdditionWorkload.hpp"
 #include "workloads/NeonActivationWorkload.hpp"
+#include "workloads/NeonArgMinMaxWorkload.hpp"
 #include "workloads/NeonBatchNormalizationWorkload.hpp"
 #include "workloads/NeonConvolution2dWorkload.hpp"
 #include "workloads/NeonDepthToSpaceWorkload.hpp"
@@ -146,6 +147,18 @@
                                    output);
 }
 
+bool NeonLayerSupport::IsArgMinMaxSupported(const TensorInfo& input,
+                                            const TensorInfo& output,
+                                            const ArgMinMaxDescriptor& descriptor,
+                                            Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonArgMinMaxWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output,
+                                   descriptor);
+}
+
 bool NeonLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input,
                                                      const TensorInfo& output,
                                                      const TensorInfo& mean,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 31f9e57..76eb342 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -26,6 +26,11 @@
                              const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsArgMinMaxSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const ArgMinMaxDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsBatchNormalizationSupported(const TensorInfo& input,
                                        const TensorInfo& output,
                                        const TensorInfo& mean,
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index 37013eb..ca5bfb0 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -186,6 +186,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
                                                                  static_cast<int16_t*>(memory));
                 break;
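+            // ArgMinMax produces Signed32 indices, so S32 data must be copyable.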
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int32_t*>(memory));
+                break;
             default:
             {
                 throw armnn::UnimplementedException();
@@ -216,6 +220,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
                                                                  this->GetTensor());
                 break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
             default:
             {
                 throw armnn::UnimplementedException();
@@ -292,6 +300,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
                                                                  static_cast<int16_t*>(memory));
                 break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int32_t*>(memory));
+                break;
             default:
             {
                 throw armnn::UnimplementedException();
@@ -318,6 +330,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
                                                                  this->GetTensor());
                 break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
             default:
             {
                 throw armnn::UnimplementedException();
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index f0b738c..3492923 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -442,4 +442,10 @@
     return std::make_unique<NeonStackWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
+                                                                const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonArgMinMaxWorkload>(descriptor, info);
+}
+
 } // namespace armnn
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index 4bdbc8e..aad9cf9 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -200,6 +200,9 @@
     std::unique_ptr<IWorkload> CreateStack(const StackQueueDescriptor& descriptor,
                                            const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
 private:
     mutable std::shared_ptr<NeonMemoryManager> m_MemoryManager;
 };
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 3e46387..fb10a0d 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -24,6 +24,7 @@
         workloads/NeonAbsWorkload.cpp \
         workloads/NeonActivationWorkload.cpp \
         workloads/NeonAdditionWorkload.cpp \
+        workloads/NeonArgMinMaxWorkload.cpp \
         workloads/NeonBatchNormalizationWorkload.cpp \
         workloads/NeonConcatWorkload.cpp \
         workloads/NeonConstantWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 3877183..0d1faa9 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -910,6 +910,14 @@
 ARMNN_AUTO_TEST_CASE(RsqrtZero, RsqrtZeroTest<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE(RsqrtNegative, RsqrtNegativeTest<DataType::Float32>)
 
+// ArgMinMax
+ARMNN_AUTO_TEST_CASE(ArgMinFloat32, ArgMinSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMaxFloat32, ArgMaxSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMinChannel, ArgMinChannelTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMaxChannel, ArgMaxChannelTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMaxHeight, ArgMaxHeightTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMinWidth, ArgMinWidthTest<DataType::Float32>)
+
 #if defined(ARMNNREF_ENABLED)
 
 // The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index 42ac641..f8d5922 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -10,6 +10,8 @@
     NeonActivationWorkload.hpp
     NeonAdditionWorkload.cpp
     NeonAdditionWorkload.hpp
+    NeonArgMinMaxWorkload.cpp
+    NeonArgMinMaxWorkload.hpp
     NeonBatchNormalizationWorkload.cpp
     NeonBatchNormalizationWorkload.hpp
     NeonConcatWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp b/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp
new file mode 100644
index 0000000..e8d537f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp
@@ -0,0 +1,79 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonArgMinMaxWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <TensorUtils.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h>
+
+namespace
+{
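+// armnn orders tensor dimensions outermost-first (e.g. NCHW), while ACL orders
+// them innermost-first, so an armnn axis is mirrored before being handed to ACL.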
+unsigned int CalcAclAxis(unsigned int numDimensions, unsigned int axisIndex)
+{
+    return (numDimensions - axisIndex) - 1;
+}
+
+} //namespace
+
+namespace armnn
+{
+
+arm_compute::Status NeonArgMinMaxWorkloadValidate(const TensorInfo& input,
+                                                  const TensorInfo& output,
+                                                  const ArgMinMaxDescriptor& descriptor)
+{
+    const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
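+    // GetUnsignedAxis wraps a negative axis (e.g. -1) into [0, numDims) before
+    // the index is converted to ACL's reversed dimension ordering.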
+    auto numDims = input.GetNumDimensions();
+    auto unsignedAxis = armnnUtils::GetUnsignedAxis(numDims, descriptor.m_Axis);
+    int aclAxis = boost::numeric_cast<int>(CalcAclAxis(numDims, unsignedAxis));
+
+    if (descriptor.m_Function == ArgMinMaxFunction::Max)
+    {
+        return arm_compute::NEArgMinMaxLayer::validate(&aclInput, aclAxis, &aclOutput,
+                                                       arm_compute::ReductionOperation::ARG_IDX_MAX);
+    }
+    else
+    {
+        return arm_compute::NEArgMinMaxLayer::validate(&aclInput, aclAxis, &aclOutput,
+                                                       arm_compute::ReductionOperation::ARG_IDX_MIN);
+    }
+}
+
+NeonArgMinMaxWorkload::NeonArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info)
+        : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info)
+{
+    arm_compute::ITensor& input = boost::polymorphic_downcast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = boost::polymorphic_downcast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    auto numDims = info.m_InputTensorInfos[0].GetNumDimensions();
+    auto unsignedAxis = armnnUtils::GetUnsignedAxis(numDims, m_Data.m_Parameters.m_Axis);
+    int aclAxis = boost::numeric_cast<int>(CalcAclAxis(numDims, unsignedAxis));
+
+    if (m_Data.m_Parameters.m_Function == ArgMinMaxFunction::Max)
+    {
+        m_ArgMinMaxLayer.configure(&input, aclAxis, &output, arm_compute::ReductionOperation::ARG_IDX_MAX);
+    }
+    else
+    {
+        m_ArgMinMaxLayer.configure(&input, aclAxis, &output, arm_compute::ReductionOperation::ARG_IDX_MIN);
+    }
+}
+
+void NeonArgMinMaxWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonArgMinMaxWorkload_Execute");
+    m_ArgMinMaxLayer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonArgMinMaxWorkload.hpp b/src/backends/neon/workloads/NeonArgMinMaxWorkload.hpp
new file mode 100644
index 0000000..6301b13
--- /dev/null
+++ b/src/backends/neon/workloads/NeonArgMinMaxWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonArgMinMaxWorkloadValidate(const TensorInfo& input,
+                                                  const TensorInfo& output,
+                                                  const ArgMinMaxDescriptor& descriptor);
+
+class NeonArgMinMaxWorkload : public BaseWorkload<ArgMinMaxQueueDescriptor>
+{
+public:
+    NeonArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
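+    // Execute() is const, but NEArgMinMaxLayer::run() is not, hence mutable.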
+    mutable arm_compute::NEArgMinMaxLayer m_ArgMinMaxLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 9d35ed4..8044a4f 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -7,6 +7,7 @@
 #include "NeonAbsWorkload.hpp"
 #include "NeonActivationWorkload.hpp"
 #include "NeonAdditionWorkload.hpp"
+#include "NeonArgMinMaxWorkload.hpp"
 #include "NeonBatchNormalizationWorkload.hpp"
 #include "NeonConstantWorkload.hpp"
 #include "NeonConvertFp16ToFp32Workload.hpp"
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 901017a..0058e15 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1025,18 +1025,20 @@
 // ArgMinMax
 ARMNN_AUTO_TEST_CASE(ArgMaxFloat32, ArgMaxSimpleTest<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE(ArgMinFloat32, ArgMinSimpleTest<DataType::Float32>)
-ARMNN_AUTO_TEST_CASE(ArgMinChannel4dFloat32, ArgMinChannel4dTest<DataType::Float32>)
-ARMNN_AUTO_TEST_CASE(ArgMaxChannel4dFloat32, ArgMaxChannel4dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMinChannelFloat32, ArgMinChannelTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMaxChannelFloat32, ArgMaxChannelTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMaxHeightFloat32, ArgMaxHeightTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(ArgMinWidthFloat32, ArgMinWidthTest<DataType::Float32>)
 
-ARMNN_AUTO_TEST_CASE(ArgMaxQuantisedAsymm8, ArgMaxSimpleTest<DataType::QuantisedAsymm8>)
-ARMNN_AUTO_TEST_CASE(ArgMinQuantisedAsymm8, ArgMinSimpleTest<DataType::QuantisedAsymm8>)
-ARMNN_AUTO_TEST_CASE(ArgMinChannel4dQuantisedAsymm8, ArgMinChannel4dTest<DataType::QuantisedAsymm8>)
-ARMNN_AUTO_TEST_CASE(ArgMaxChannel4dQuantisedAsymm8, ArgMaxChannel4dTest<DataType::QuantisedAsymm8>)
+ARMNN_AUTO_TEST_CASE(ArgMaxSimpleQuantisedAsymm8, ArgMaxSimpleTest<DataType::QuantisedAsymm8>)
+ARMNN_AUTO_TEST_CASE(ArgMinSimpleQuantisedAsymm8, ArgMinSimpleTest<DataType::QuantisedAsymm8>)
+ARMNN_AUTO_TEST_CASE(ArgMinChannelQuantisedAsymm8, ArgMinChannelTest<DataType::QuantisedAsymm8>)
+ARMNN_AUTO_TEST_CASE(ArgMaxChannelQuantisedAsymm8, ArgMaxChannelTest<DataType::QuantisedAsymm8>)
 
-ARMNN_AUTO_TEST_CASE(ArgMaxQuantisedSymm16, ArgMaxSimpleTest<DataType::QuantisedSymm16>)
-ARMNN_AUTO_TEST_CASE(ArgMinQuantisedSymm16, ArgMinSimpleTest<DataType::QuantisedSymm16>)
-ARMNN_AUTO_TEST_CASE(ArgMinChannel4dQuantisedSymm16, ArgMinChannel4dTest<DataType::QuantisedSymm16>)
-ARMNN_AUTO_TEST_CASE(ArgMaxChannel4dQuantisedSymm16, ArgMaxChannel4dTest<DataType::QuantisedSymm16>)
+ARMNN_AUTO_TEST_CASE(ArgMaxSimpleQuantisedSymm16, ArgMaxSimpleTest<DataType::QuantisedSymm16>)
+ARMNN_AUTO_TEST_CASE(ArgMinSimpleQuantisedSymm16, ArgMinSimpleTest<DataType::QuantisedSymm16>)
+ARMNN_AUTO_TEST_CASE(ArgMinChannelQuantisedSymm16, ArgMinChannelTest<DataType::QuantisedSymm16>)
+ARMNN_AUTO_TEST_CASE(ArgMaxChannelQuantisedSymm16, ArgMaxChannelTest<DataType::QuantisedSymm16>)
 
 // Space To Batch Nd
 ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleFloat32, SpaceToBatchNdSimpleFloat32Test)