IVGCVSW-2971 Support QSymm16 for DetectionPostProcess workloads
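
 * Added QuantisedSymm16 to the input types accepted by
   DetectionPostProcessQueueDescriptor::Validate()
 * Refactored RefDetectionPostProcessFloat32Workload and
   RefDetectionPostProcessUint8Workload into a single
   RefDetectionPostProcessWorkload that reads its inputs through
   Decoder<float>
 * Templated the DetectionPostProcess layer tests on the quantized
   data type and added QSymm16 test cases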

Signed-off-by: Aron Virginas-Tar <Aron.Virginas-Tar@arm.com>
Change-Id: I8af45afe851a9ccbf8bce54727147fcd52ac9a1f
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index a373f55..d0aaf1d 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1459,53 +1459,63 @@
 
 void DetectionPostProcessQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
-    ValidateNumInputs(workloadInfo, "DetectionPostProcessQueueDescriptor", 2);
+    const std::string descriptorName = "DetectionPostProcessQueueDescriptor";
+    ValidateNumInputs(workloadInfo, descriptorName, 2);
 
     if (workloadInfo.m_OutputTensorInfos.size() != 4)
     {
-        throw InvalidArgumentException("DetectionPostProcessQueueDescriptor: Requires exactly four outputs. " +
+        throw InvalidArgumentException(descriptorName + ": Requires exactly four outputs. " +
                                        to_string(workloadInfo.m_OutputTensorInfos.size()) + " has been provided.");
     }
 
     if (m_Anchors == nullptr)
     {
-        throw InvalidArgumentException("DetectionPostProcessQueueDescriptor: Anchors tensor descriptor is missing.");
+        throw InvalidArgumentException(descriptorName + ": Anchors tensor descriptor is missing.");
     }
 
     const TensorInfo& boxEncodingsInfo =  workloadInfo.m_InputTensorInfos[0];
-    const TensorInfo& scoresInfo =  workloadInfo.m_InputTensorInfos[1];
-    const TensorInfo& anchorsInfo = m_Anchors->GetTensorInfo();
-    const TensorInfo& detectionBoxesInfo = workloadInfo.m_OutputTensorInfos[0];
+    const TensorInfo& scoresInfo       =  workloadInfo.m_InputTensorInfos[1];
+    const TensorInfo& anchorsInfo      = m_Anchors->GetTensorInfo();
+
+    const TensorInfo& detectionBoxesInfo   = workloadInfo.m_OutputTensorInfos[0];
     const TensorInfo& detectionClassesInfo = workloadInfo.m_OutputTensorInfos[1];
-    const TensorInfo& detectionScoresInfo = workloadInfo.m_OutputTensorInfos[2];
-    const TensorInfo& numDetectionsInfo = workloadInfo.m_OutputTensorInfos[3];
+    const TensorInfo& detectionScoresInfo  = workloadInfo.m_OutputTensorInfos[2];
+    const TensorInfo& numDetectionsInfo    = workloadInfo.m_OutputTensorInfos[3];
 
-    ValidateTensorNumDimensions(boxEncodingsInfo, "DetectionPostProcessQueueDescriptor", 3, "box encodings");
-    ValidateTensorNumDimensions(scoresInfo, "DetectionPostProcessQueueDescriptor", 3, "scores");
-    ValidateTensorNumDimensions(anchorsInfo, "DetectionPostProcessQueueDescriptor", 2, "anchors");
+    ValidateTensorNumDimensions(boxEncodingsInfo, descriptorName, 3, "box encodings");
+    ValidateTensorNumDimensions(scoresInfo, descriptorName, 3, "scores");
+    ValidateTensorNumDimensions(anchorsInfo, descriptorName, 2, "anchors");
 
-    ValidateTensorNumDimensions(detectionBoxesInfo, "DetectionPostProcessQueueDescriptor", 3, "detection boxes");
-    ValidateTensorNumDimensions(detectionScoresInfo, "DetectionPostProcessQueueDescriptor", 2, "detection scores");
-    ValidateTensorNumDimensions(detectionClassesInfo, "DetectionPostProcessQueueDescriptor", 2, "detection classes");
-    ValidateTensorNumDimensions(numDetectionsInfo, "DetectionPostProcessQueueDescriptor", 1, "num detections");
+    const std::vector<DataType> supportedInputTypes =
+    {
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
+    };
 
-    ValidateTensorDataType(detectionBoxesInfo, DataType::Float32,
-                          "DetectionPostProcessQueueDescriptor", "detection boxes");
-    ValidateTensorDataType(detectionScoresInfo, DataType::Float32,
-                          "DetectionPostProcessQueueDescriptor", "detection scores");
-    ValidateTensorDataType(detectionClassesInfo, DataType::Float32,
-                          "DetectionPostProcessQueueDescriptor", "detection classes");
-    ValidateTensorDataType(numDetectionsInfo, DataType::Float32,
-                          "DetectionPostProcessQueueDescriptor", "num detections");
+    ValidateDataTypes(boxEncodingsInfo, supportedInputTypes, descriptorName);
+    ValidateDataTypes(scoresInfo, supportedInputTypes, descriptorName);
+    ValidateDataTypes(anchorsInfo, supportedInputTypes, descriptorName);
+
+    ValidateTensorNumDimensions(detectionBoxesInfo, descriptorName, 3, "detection boxes");
+    ValidateTensorNumDimensions(detectionScoresInfo, descriptorName, 2, "detection scores");
+    ValidateTensorNumDimensions(detectionClassesInfo, descriptorName, 2, "detection classes");
+    ValidateTensorNumDimensions(numDetectionsInfo, descriptorName, 1, "num detections");
+
+    // NOTE: Output is always Float32 regardless of input type
+    ValidateTensorDataType(detectionBoxesInfo, DataType::Float32, descriptorName, "detection boxes");
+    ValidateTensorDataType(detectionScoresInfo, DataType::Float32, descriptorName, "detection scores");
+    ValidateTensorDataType(detectionClassesInfo, DataType::Float32, descriptorName, "detection classes");
+    ValidateTensorDataType(numDetectionsInfo, DataType::Float32, descriptorName, "num detections");
 
     if (m_Parameters.m_NmsIouThreshold <= 0.0f || m_Parameters.m_NmsIouThreshold > 1.0f)
     {
-        throw InvalidArgumentException("DetectionPostProcessQueueDescriptor: Intersection over union threshold "
+        throw InvalidArgumentException(descriptorName + ": Intersection over union threshold "
                                        "must be positive and less than or equal to 1.");
     }
     if (scoresInfo.GetShape()[2] != m_Parameters.m_NumClasses + 1)
     {
-        throw InvalidArgumentException("DetectionPostProcessQueueDescriptor: Number of classes with background "
+        throw InvalidArgumentException(descriptorName + ": Number of classes with background "
                                        "should be equal to number of classes + 1.");
     }
 }
diff --git a/src/backends/backendsCommon/test/DetectionPostProcessLayerTestImpl.hpp b/src/backends/backendsCommon/test/DetectionPostProcessLayerTestImpl.hpp
index 092ce26..2726fde 100644
--- a/src/backends/backendsCommon/test/DetectionPostProcessLayerTestImpl.hpp
+++ b/src/backends/backendsCommon/test/DetectionPostProcessLayerTestImpl.hpp
@@ -15,7 +15,124 @@
 #include <backendsCommon/test/WorkloadFactoryHelper.hpp>
 #include <test/TensorHelpers.hpp>
 
-template <typename FactoryType, armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+namespace
+{
+
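+// Shared input data, quantization parameters and expected results for the DetectionPostProcess layer tests.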
+using FloatData = std::vector<float>;
+using QuantData = std::pair<float, int32_t>;
+
+struct TestData
+{
+    static const armnn::TensorShape s_BoxEncodingsShape;
+    static const armnn::TensorShape s_ScoresShape;
+    static const armnn::TensorShape s_AnchorsShape;
+
+    static const QuantData s_BoxEncodingsQuantData;
+    static const QuantData s_ScoresQuantData;
+    static const QuantData s_AnchorsQuantData;
+
+    static const FloatData s_BoxEncodings;
+    static const FloatData s_Scores;
+    static const FloatData s_Anchors;
+};
+
+struct RegularNmsExpectedResults
+{
+    static const FloatData s_DetectionBoxes;
+    static const FloatData s_DetectionScores;
+    static const FloatData s_DetectionClasses;
+    static const FloatData s_NumDetections;
+};
+
+struct FastNmsExpectedResults
+{
+    static const FloatData s_DetectionBoxes;
+    static const FloatData s_DetectionScores;
+    static const FloatData s_DetectionClasses;
+    static const FloatData s_NumDetections;
+};
+
+const armnn::TensorShape TestData::s_BoxEncodingsShape = { 1, 6, 4 };
+const armnn::TensorShape TestData::s_ScoresShape       = { 1, 6, 3 };
+const armnn::TensorShape TestData::s_AnchorsShape      = { 6, 4 };
+
+const QuantData TestData::s_BoxEncodingsQuantData = { 1.00f, 1 };
+const QuantData TestData::s_ScoresQuantData       = { 0.01f, 0 };
+const QuantData TestData::s_AnchorsQuantData      = { 0.50f, 0 };
+
+const FloatData TestData::s_BoxEncodings =
+{
+    0.0f,  0.0f, 0.0f, 0.0f,
+    0.0f,  1.0f, 0.0f, 0.0f,
+    0.0f, -1.0f, 0.0f, 0.0f,
+    0.0f,  0.0f, 0.0f, 0.0f,
+    0.0f,  1.0f, 0.0f, 0.0f,
+    0.0f,  0.0f, 0.0f, 0.0f
+};
+
+const FloatData TestData::s_Scores =
+{
+    0.0f, 0.90f, 0.80f,
+    0.0f, 0.75f, 0.72f,
+    0.0f, 0.60f, 0.50f,
+    0.0f, 0.93f, 0.95f,
+    0.0f, 0.50f, 0.40f,
+    0.0f, 0.30f, 0.20f
+};
+
+const FloatData TestData::s_Anchors =
+{
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,  10.5f, 1.0f, 1.0f,
+    0.5f,  10.5f, 1.0f, 1.0f,
+    0.5f, 100.5f, 1.0f, 1.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionBoxes =
+{
+    0.0f, 10.0f, 1.0f, 11.0f,
+    0.0f, 10.0f, 1.0f, 11.0f,
+    0.0f,  0.0f, 0.0f,  0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionScores =
+{
+    0.95f, 0.93f, 0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionClasses =
+{
+    1.0f, 0.0f, 0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_NumDetections = { 2.0f };
+
+const FloatData FastNmsExpectedResults::s_DetectionBoxes =
+{
+    0.0f,  10.0f, 1.0f,  11.0f,
+    0.0f,   0.0f, 1.0f,   1.0f,
+    0.0f, 100.0f, 1.0f, 101.0f
+};
+
+const FloatData FastNmsExpectedResults::s_DetectionScores =
+{
+    0.95f, 0.9f, 0.3f
+};
+
+const FloatData FastNmsExpectedResults::s_DetectionClasses =
+{
+    1.0f, 0.0f, 0.0f
+};
+
+const FloatData FastNmsExpectedResults::s_NumDetections = { 3.0f };
+
+} // anonymous namespace
+
+template<typename FactoryType,
+         armnn::DataType ArmnnType,
+         typename T = armnn::ResolveType<ArmnnType>>
 void DetectionPostProcessImpl(const armnn::TensorInfo& boxEncodingsInfo,
                               const armnn::TensorInfo& scoresInfo,
                               const armnn::TensorInfo& anchorsInfo,
@@ -110,254 +227,140 @@
     BOOST_TEST(CompareTensors(numDetectionsResult.output, numDetectionsResult.outputExpected));
 }
 
-inline void QuantizeData(uint8_t* quant, const float* dequant, const armnn::TensorInfo& info)
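+// Quantizes a buffer of float values into the raw type corresponding to QuantizedType
+// (e.g. uint8_t for QuantisedAsymm8, int16_t for QuantisedSymm16).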
+template<armnn::DataType QuantizedType, typename RawType = armnn::ResolveType<QuantizedType>>
+void QuantizeData(RawType* quant, const float* dequant, const armnn::TensorInfo& info)
 {
     for (size_t i = 0; i < info.GetNumElements(); i++)
     {
-        quant[i] = armnn::Quantize<uint8_t>(dequant[i], info.GetQuantizationScale(), info.GetQuantizationOffset());
+        quant[i] = armnn::Quantize<RawType>(
+            dequant[i], info.GetQuantizationScale(), info.GetQuantizationOffset());
     }
 }
 
-template <typename FactoryType>
+template<typename FactoryType>
 void DetectionPostProcessRegularNmsFloatTest()
 {
-    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
-    armnn::TensorInfo scoresInfo({ 1, 6, 3}, armnn::DataType::Float32);
-    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
-
-    std::vector<float> boxEncodingsData({
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, -1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> scoresData({
-        0.0f, 0.9f, 0.8f,
-        0.0f, 0.75f, 0.72f,
-        0.0f, 0.6f, 0.5f,
-        0.0f, 0.93f, 0.95f,
-        0.0f, 0.5f, 0.4f,
-        0.0f, 0.3f, 0.2f
-    });
-    std::vector<float> anchorsData({
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 100.5f, 1.0f, 1.0f
-    });
-
-    std::vector<float> expectedDetectionBoxes({
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> expectedDetectionScores({ 0.95f, 0.93f, 0.0f });
-    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
-    std::vector<float> expectedNumDetections({ 2.0f });
-
-    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(boxEncodingsInfo,
-                                                                           scoresInfo,
-                                                                           anchorsInfo,
-                                                                           boxEncodingsData,
-                                                                           scoresData,
-                                                                           anchorsData,
-                                                                           expectedDetectionBoxes,
-                                                                           expectedDetectionClasses,
-                                                                           expectedDetectionScores,
-                                                                           expectedNumDetections,
-                                                                           true);
+    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(
+        armnn::TensorInfo(TestData::s_BoxEncodingsShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_ScoresShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_AnchorsShape, armnn::DataType::Float32),
+        TestData::s_BoxEncodings,
+        TestData::s_Scores,
+        TestData::s_Anchors,
+        RegularNmsExpectedResults::s_DetectionBoxes,
+        RegularNmsExpectedResults::s_DetectionClasses,
+        RegularNmsExpectedResults::s_DetectionScores,
+        RegularNmsExpectedResults::s_NumDetections,
+        true);
 }
 
-template <typename FactoryType>
-void DetectionPostProcessRegularNmsUint8Test()
+template<typename FactoryType,
+         armnn::DataType QuantizedType,
+         typename RawType = armnn::ResolveType<QuantizedType>>
+void DetectionPostProcessRegularNmsQuantizedTest()
 {
-    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::QuantisedAsymm8);
-    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::QuantisedAsymm8);
-    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo boxEncodingsInfo(TestData::s_BoxEncodingsShape, QuantizedType);
+    armnn::TensorInfo scoresInfo(TestData::s_ScoresShape, QuantizedType);
+    armnn::TensorInfo anchorsInfo(TestData::s_AnchorsShape, QuantizedType);
 
-    boxEncodingsInfo.SetQuantizationScale(1.0f);
-    boxEncodingsInfo.SetQuantizationOffset(1);
-    scoresInfo.SetQuantizationScale(0.01f);
-    scoresInfo.SetQuantizationOffset(0);
-    anchorsInfo.SetQuantizationScale(0.5f);
-    anchorsInfo.SetQuantizationOffset(0);
+    boxEncodingsInfo.SetQuantizationScale(TestData::s_BoxEncodingsQuantData.first);
+    boxEncodingsInfo.SetQuantizationOffset(TestData::s_BoxEncodingsQuantData.second);
 
-    std::vector<float> boxEncodings({
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, -1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> scores({
-        0.0f, 0.9f, 0.8f,
-        0.0f, 0.75f, 0.72f,
-        0.0f, 0.6f, 0.5f,
-        0.0f, 0.93f, 0.95f,
-        0.0f, 0.5f, 0.4f,
-        0.0f, 0.3f, 0.2f
-    });
-    std::vector<float> anchors({
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 100.5f, 1.0f, 1.0f
-    });
+    scoresInfo.SetQuantizationScale(TestData::s_ScoresQuantData.first);
+    scoresInfo.SetQuantizationOffset(TestData::s_ScoresQuantData.second);
 
-    std::vector<uint8_t> boxEncodingsData(boxEncodings.size(), 0);
-    std::vector<uint8_t> scoresData(scores.size(), 0);
-    std::vector<uint8_t> anchorsData(anchors.size(), 0);
-    QuantizeData(boxEncodingsData.data(), boxEncodings.data(), boxEncodingsInfo);
-    QuantizeData(scoresData.data(), scores.data(), scoresInfo);
-    QuantizeData(anchorsData.data(), anchors.data(), anchorsInfo);
+    anchorsInfo.SetQuantizationScale(TestData::s_AnchorsQuantData.first);
+    anchorsInfo.SetQuantizationOffset(TestData::s_AnchorsQuantData.second);
 
-    std::vector<float> expectedDetectionBoxes({
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> expectedDetectionScores({ 0.95f, 0.93f, 0.0f });
-    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
-    std::vector<float> expectedNumDetections({ 2.0f });
+    std::vector<RawType> boxEncodingsData(TestData::s_BoxEncodingsShape.GetNumElements());
+    QuantizeData<QuantizedType>(boxEncodingsData.data(),
+                                TestData::s_BoxEncodings.data(),
+                                boxEncodingsInfo);
 
-    return DetectionPostProcessImpl<FactoryType, armnn::DataType::QuantisedAsymm8>(boxEncodingsInfo,
-                                                                                   scoresInfo,
-                                                                                   anchorsInfo,
-                                                                                   boxEncodingsData,
-                                                                                   scoresData,
-                                                                                   anchorsData,
-                                                                                   expectedDetectionBoxes,
-                                                                                   expectedDetectionClasses,
-                                                                                   expectedDetectionScores,
-                                                                                   expectedNumDetections,
-                                                                                   true);
+    std::vector<RawType> scoresData(TestData::s_ScoresShape.GetNumElements());
+    QuantizeData<QuantizedType>(scoresData.data(),
+                                TestData::s_Scores.data(),
+                                scoresInfo);
+
+    std::vector<RawType> anchorsData(TestData::s_AnchorsShape.GetNumElements());
+    QuantizeData<QuantizedType>(anchorsData.data(),
+                                TestData::s_Anchors.data(),
+                                anchorsInfo);
+
+    return DetectionPostProcessImpl<FactoryType, QuantizedType>(
+        boxEncodingsInfo,
+        scoresInfo,
+        anchorsInfo,
+        boxEncodingsData,
+        scoresData,
+        anchorsData,
+        RegularNmsExpectedResults::s_DetectionBoxes,
+        RegularNmsExpectedResults::s_DetectionClasses,
+        RegularNmsExpectedResults::s_DetectionScores,
+        RegularNmsExpectedResults::s_NumDetections,
+        true);
 }
 
-template <typename FactoryType>
+template<typename FactoryType>
 void DetectionPostProcessFastNmsFloatTest()
 {
-    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
-    armnn::TensorInfo scoresInfo({ 1, 6, 3}, armnn::DataType::Float32);
-    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
-
-    std::vector<float> boxEncodingsData({
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, -1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> scoresData({
-        0.0f, 0.9f, 0.8f,
-        0.0f, 0.75f, 0.72f,
-        0.0f, 0.6f, 0.5f,
-        0.0f, 0.93f, 0.95f,
-        0.0f, 0.5f, 0.4f,
-        0.0f, 0.3f, 0.2f
-    });
-    std::vector<float> anchorsData({
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 100.5f, 1.0f, 1.0f
-    });
-
-    std::vector<float> expectedDetectionBoxes({
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 0.0f, 1.0f, 1.0f,
-        0.0f, 100.0f, 1.0f, 101.0f
-    });
-    std::vector<float> expectedDetectionScores({ 0.95f, 0.9f, 0.3f });
-    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
-    std::vector<float> expectedNumDetections({ 3.0f });
-
-    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(boxEncodingsInfo,
-                                                                           scoresInfo,
-                                                                           anchorsInfo,
-                                                                           boxEncodingsData,
-                                                                           scoresData,
-                                                                           anchorsData,
-                                                                           expectedDetectionBoxes,
-                                                                           expectedDetectionClasses,
-                                                                           expectedDetectionScores,
-                                                                           expectedNumDetections,
-                                                                           false);
+    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(
+        armnn::TensorInfo(TestData::s_BoxEncodingsShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_ScoresShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_AnchorsShape, armnn::DataType::Float32),
+        TestData::s_BoxEncodings,
+        TestData::s_Scores,
+        TestData::s_Anchors,
+        FastNmsExpectedResults::s_DetectionBoxes,
+        FastNmsExpectedResults::s_DetectionClasses,
+        FastNmsExpectedResults::s_DetectionScores,
+        FastNmsExpectedResults::s_NumDetections,
+        false);
 }
 
-template <typename FactoryType>
-void DetectionPostProcessFastNmsUint8Test()
+template<typename FactoryType,
+         armnn::DataType QuantizedType,
+         typename RawType = armnn::ResolveType<QuantizedType>>
+void DetectionPostProcessFastNmsQuantizedTest()
 {
-    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::QuantisedAsymm8);
-    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::QuantisedAsymm8);
-    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo boxEncodingsInfo(TestData::s_BoxEncodingsShape, QuantizedType);
+    armnn::TensorInfo scoresInfo(TestData::s_ScoresShape, QuantizedType);
+    armnn::TensorInfo anchorsInfo(TestData::s_AnchorsShape, QuantizedType);
 
-    boxEncodingsInfo.SetQuantizationScale(1.0f);
-    boxEncodingsInfo.SetQuantizationOffset(1);
-    scoresInfo.SetQuantizationScale(0.01f);
-    scoresInfo.SetQuantizationOffset(0);
-    anchorsInfo.SetQuantizationScale(0.5f);
-    anchorsInfo.SetQuantizationOffset(0);
+    boxEncodingsInfo.SetQuantizationScale(TestData::s_BoxEncodingsQuantData.first);
+    boxEncodingsInfo.SetQuantizationOffset(TestData::s_BoxEncodingsQuantData.second);
 
-    std::vector<float> boxEncodings({
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, -1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 0.0f
-    });
-    std::vector<float> scores({
-        0.0f, 0.9f, 0.8f,
-        0.0f, 0.75f, 0.72f,
-        0.0f, 0.6f, 0.5f,
-        0.0f, 0.93f, 0.95f,
-        0.0f, 0.5f, 0.4f,
-        0.0f, 0.3f, 0.2f
-    });
-    std::vector<float> anchors({
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 0.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 10.5f, 1.0f, 1.0f,
-        0.5f, 100.5f, 1.0f, 1.0f
-    });
+    scoresInfo.SetQuantizationScale(TestData::s_ScoresQuantData.first);
+    scoresInfo.SetQuantizationOffset(TestData::s_ScoresQuantData.second);
 
-    std::vector<uint8_t> boxEncodingsData(boxEncodings.size(), 0);
-    std::vector<uint8_t> scoresData(scores.size(), 0);
-    std::vector<uint8_t> anchorsData(anchors.size(), 0);
-    QuantizeData(boxEncodingsData.data(), boxEncodings.data(), boxEncodingsInfo);
-    QuantizeData(scoresData.data(), scores.data(), scoresInfo);
-    QuantizeData(anchorsData.data(), anchors.data(), anchorsInfo);
+    anchorsInfo.SetQuantizationScale(TestData::s_AnchorsQuantData.first);
+    anchorsInfo.SetQuantizationOffset(TestData::s_AnchorsQuantData.second);
 
-    std::vector<float> expectedDetectionBoxes({
-        0.0f, 10.0f, 1.0f, 11.0f,
-        0.0f, 0.0f, 1.0f, 1.0f,
-        0.0f, 100.0f, 1.0f, 101.0f
-    });
-    std::vector<float> expectedDetectionScores({ 0.95f, 0.9f, 0.3f });
-    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
-    std::vector<float> expectedNumDetections({ 3.0f });
+    std::vector<RawType> boxEncodingsData(TestData::s_BoxEncodingsShape.GetNumElements());
+    QuantizeData<QuantizedType>(boxEncodingsData.data(),
+                                TestData::s_BoxEncodings.data(),
+                                boxEncodingsInfo);
 
-    return DetectionPostProcessImpl<FactoryType, armnn::DataType::QuantisedAsymm8>(boxEncodingsInfo,
-                                                                                   scoresInfo,
-                                                                                   anchorsInfo,
-                                                                                   boxEncodingsData,
-                                                                                   scoresData,
-                                                                                   anchorsData,
-                                                                                   expectedDetectionBoxes,
-                                                                                   expectedDetectionClasses,
-                                                                                   expectedDetectionScores,
-                                                                                   expectedNumDetections,
-                                                                                   false);
-}
+    std::vector<RawType> scoresData(TestData::s_ScoresShape.GetNumElements());
+    QuantizeData<QuantizedType>(scoresData.data(),
+                                TestData::s_Scores.data(),
+                                scoresInfo);
+
+    std::vector<RawType> anchorsData(TestData::s_AnchorsShape.GetNumElements());
+    QuantizeData<QuantizedType>(anchorsData.data(),
+                                TestData::s_Anchors.data(),
+                                anchorsInfo);
+
+    return DetectionPostProcessImpl<FactoryType, QuantizedType>(
+        boxEncodingsInfo,
+        scoresInfo,
+        anchorsInfo,
+        boxEncodingsData,
+        scoresData,
+        anchorsData,
+        FastNmsExpectedResults::s_DetectionBoxes,
+        FastNmsExpectedResults::s_DetectionClasses,
+        FastNmsExpectedResults::s_DetectionScores,
+        FastNmsExpectedResults::s_NumDetections,
+        false);
+}
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index d103f56..5e247b2 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -179,16 +179,7 @@
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateDetectionPostProcess(
     const armnn::DetectionPostProcessQueueDescriptor& descriptor, const armnn::WorkloadInfo& info) const
 {
-    const DataType dataType = info.m_InputTensorInfos[0].GetDataType();
-    switch (dataType)
-    {
-        case DataType::Float32:
-            return std::make_unique<RefDetectionPostProcessFloat32Workload>(descriptor, info);
-        case DataType::QuantisedAsymm8:
-            return std::make_unique<RefDetectionPostProcessUint8Workload>(descriptor, info);
-        default:
-            return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
-    }
+    return std::make_unique<RefDetectionPostProcessWorkload>(descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateNormalization(
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 81b6de1..edf1431 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -37,8 +37,7 @@
         workloads/RefDebugWorkload.cpp \
         workloads/RefDepthwiseConvolution2dWorkload.cpp \
         workloads/RefDequantizeWorkload.cpp \
-        workloads/RefDetectionPostProcessFloat32Workload.cpp \
-        workloads/RefDetectionPostProcessUint8Workload.cpp \
+        workloads/RefDetectionPostProcessWorkload.cpp \
         workloads/RefElementwiseWorkload.cpp \
         workloads/RefFakeQuantizationFloat32Workload.cpp \
         workloads/RefFloorWorkload.cpp \
diff --git a/src/backends/reference/test/RefDetectionPostProcessTests.cpp b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
index a9faff7..fab6e00 100644
--- a/src/backends/reference/test/RefDetectionPostProcessTests.cpp
+++ b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: MIT
 //
 
-#include "reference/workloads/DetectionPostProcess.cpp"
+#include <reference/workloads/DetectionPostProcess.hpp>
 
 #include <armnn/Descriptors.hpp>
 #include <armnn/Types.hpp>
@@ -12,13 +12,12 @@
 
 BOOST_AUTO_TEST_SUITE(RefDetectionPostProcess)
 
-
 BOOST_AUTO_TEST_CASE(TopKSortTest)
 {
     unsigned int k = 3;
     unsigned int indices[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
     float values[8] = { 0, 7, 6, 5, 4, 3, 2, 500 };
-    TopKSort(k, indices, values, 8);
+    armnn::TopKSort(k, indices, values, 8);
     BOOST_TEST(indices[0] == 7);
     BOOST_TEST(indices[1] == 1);
     BOOST_TEST(indices[2] == 2);
@@ -29,7 +28,7 @@
     unsigned int k = 8;
     unsigned int indices[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
     float values[8] = { 0, 7, 6, 5, 4, 3, 2, 500 };
-    TopKSort(k, indices, values, 8);
+    armnn::TopKSort(k, indices, values, 8);
     BOOST_TEST(indices[0] == 7);
     BOOST_TEST(indices[1] == 1);
     BOOST_TEST(indices[2] == 2);
@@ -44,7 +43,7 @@
 {
     float boxI[4] = { 0.0f, 0.0f, 10.0f, 10.0f };
     float boxJ[4] = { 1.0f, 1.0f, 11.0f, 11.0f };
-    float iou = IntersectionOverUnion(boxI, boxJ);
+    float iou = armnn::IntersectionOverUnion(boxI, boxJ);
     BOOST_TEST(iou == 0.68, boost::test_tools::tolerance(0.001));
 }
 
@@ -61,14 +60,17 @@
 
     std::vector<float> scores({ 0.9f, 0.75f, 0.6f, 0.93f, 0.5f, 0.3f });
 
-    std::vector<unsigned int> result = NonMaxSuppression(6, boxCorners, scores, 0.0, 3, 0.5);
+    std::vector<unsigned int> result =
+        armnn::NonMaxSuppression(6, boxCorners, scores, 0.0, 3, 0.5);
+
     BOOST_TEST(result.size() == 3);
     BOOST_TEST(result[0] == 3);
     BOOST_TEST(result[1] == 0);
     BOOST_TEST(result[2] == 5);
 }
 
-void DetectionPostProcessTestImpl(bool useRegularNms, const std::vector<float>& expectedDetectionBoxes,
+void DetectionPostProcessTestImpl(bool useRegularNms,
+                                  const std::vector<float>& expectedDetectionBoxes,
                                   const std::vector<float>& expectedDetectionClasses,
                                   const std::vector<float>& expectedDetectionScores,
                                   const std::vector<float>& expectedNumDetections)
@@ -103,6 +105,7 @@
         0.0f, 1.0f, 0.0f, 0.0f,
         0.0f, 0.0f, 0.0f, 0.0f
     });
+
     std::vector<float> scores({
         0.0f, 0.9f, 0.8f,
         0.0f, 0.75f, 0.72f,
@@ -111,6 +114,7 @@
         0.0f, 0.5f, 0.4f,
         0.0f, 0.3f, 0.2f
     });
+
     std::vector<float> anchors({
         0.5f, 0.5f, 1.0f, 1.0f,
         0.5f, 0.5f, 1.0f, 1.0f,
@@ -120,22 +124,50 @@
         0.5f, 100.5f, 1.0f, 1.0f
     });
 
+    auto boxEncodingsDecoder = armnn::MakeDecoder<float>(boxEncodingsInfo, boxEncodings.data());
+    auto scoresDecoder       = armnn::MakeDecoder<float>(scoresInfo, scores.data());
+    auto anchorsDecoder      = armnn::MakeDecoder<float>(anchorsInfo, anchors.data());
+
     std::vector<float> detectionBoxes(detectionBoxesInfo.GetNumElements());
     std::vector<float> detectionScores(detectionScoresInfo.GetNumElements());
     std::vector<float> detectionClasses(detectionClassesInfo.GetNumElements());
     std::vector<float> numDetections(1);
 
-    armnn::DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
-                                detectionBoxesInfo, detectionClassesInfo,
-                                detectionScoresInfo, numDetectionInfo, desc,
-                                boxEncodings.data(), scores.data(), anchors.data(),
-                                detectionBoxes.data(), detectionClasses.data(),
-                                detectionScores.data(), numDetections.data());
+    armnn::DetectionPostProcess(boxEncodingsInfo,
+                                scoresInfo,
+                                anchorsInfo,
+                                detectionBoxesInfo,
+                                detectionClassesInfo,
+                                detectionScoresInfo,
+                                numDetectionInfo,
+                                desc,
+                                *boxEncodingsDecoder,
+                                *scoresDecoder,
+                                *anchorsDecoder,
+                                detectionBoxes.data(),
+                                detectionClasses.data(),
+                                detectionScores.data(),
+                                numDetections.data());
 
-    BOOST_TEST(detectionBoxes == expectedDetectionBoxes);
-    BOOST_TEST(detectionScores == expectedDetectionScores);
-    BOOST_TEST(detectionClasses == expectedDetectionClasses);
-    BOOST_TEST(numDetections == expectedNumDetections);
+    BOOST_CHECK_EQUAL_COLLECTIONS(detectionBoxes.begin(),
+                                  detectionBoxes.end(),
+                                  expectedDetectionBoxes.begin(),
+                                  expectedDetectionBoxes.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(detectionScores.begin(),
+                                  detectionScores.end(),
+                                  expectedDetectionScores.begin(),
+                                  expectedDetectionScores.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(detectionClasses.begin(),
+                                  detectionClasses.end(),
+                                  expectedDetectionClasses.begin(),
+                                  expectedDetectionClasses.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(numDetections.begin(),
+                                  numDetections.end(),
+                                  expectedNumDetections.begin(),
+                                  expectedNumDetections.end());
 }
 
 BOOST_AUTO_TEST_CASE(RegularNmsDetectionPostProcess)
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index b2f71a8..f54a8d0 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -624,11 +624,23 @@
 }
 BOOST_AUTO_TEST_CASE(DetectionPostProcessRegularNmsUint8)
 {
-    DetectionPostProcessRegularNmsUint8Test<armnn::RefWorkloadFactory>();
+    DetectionPostProcessRegularNmsQuantizedTest<
+        armnn::RefWorkloadFactory, armnn::DataType::QuantisedAsymm8>();
 }
 BOOST_AUTO_TEST_CASE(DetectionPostProcessFastNmsUint8)
 {
-    DetectionPostProcessFastNmsUint8Test<armnn::RefWorkloadFactory>();
+    DetectionPostProcessFastNmsQuantizedTest<
+        armnn::RefWorkloadFactory, armnn::DataType::QuantisedAsymm8>();
+}
+BOOST_AUTO_TEST_CASE(DetectionPostProcessRegularNmsInt16)
+{
+    DetectionPostProcessRegularNmsQuantizedTest<
+        armnn::RefWorkloadFactory, armnn::DataType::QuantisedSymm16>();
+}
+BOOST_AUTO_TEST_CASE(DetectionPostProcessFastNmsInt16)
+{
+    DetectionPostProcessFastNmsQuantizedTest<
+        armnn::RefWorkloadFactory, armnn::DataType::QuantisedSymm16>();
 }
 
 // Dequantize
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index cdca22d..25d4b28 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -60,10 +60,8 @@
     RefDepthwiseConvolution2dWorkload.hpp
     RefDequantizeWorkload.cpp
     RefDequantizeWorkload.hpp
-    RefDetectionPostProcessUint8Workload.cpp
-    RefDetectionPostProcessUint8Workload.hpp
-    RefDetectionPostProcessFloat32Workload.cpp
-    RefDetectionPostProcessFloat32Workload.hpp
+    RefDetectionPostProcessWorkload.cpp
+    RefDetectionPostProcessWorkload.hpp
     RefFakeQuantizationFloat32Workload.cpp
     RefFakeQuantizationFloat32Workload.hpp
     RefFloorWorkload.cpp
diff --git a/src/backends/reference/workloads/DetectionPostProcess.cpp b/src/backends/reference/workloads/DetectionPostProcess.cpp
index d3790f2..d475dd8 100644
--- a/src/backends/reference/workloads/DetectionPostProcess.cpp
+++ b/src/backends/reference/workloads/DetectionPostProcess.cpp
@@ -13,7 +13,7 @@
 #include <algorithm>
 #include <numeric>
 
-namespace
+namespace armnn
 {
 
 std::vector<unsigned int> GenerateRangeK(unsigned int k)
@@ -48,9 +48,12 @@
     return areaIntersection / areaUnion;
 }
 
-std::vector<unsigned int> NonMaxSuppression(unsigned int numBoxes, const std::vector<float>& boxCorners,
-                                            const std::vector<float>& scores, float nmsScoreThreshold,
-                                            unsigned int maxDetection, float nmsIouThreshold)
+std::vector<unsigned int> NonMaxSuppression(unsigned int numBoxes,
+                                            const std::vector<float>& boxCorners,
+                                            const std::vector<float>& scores,
+                                            float nmsScoreThreshold,
+                                            unsigned int maxDetection,
+                                            float nmsIouThreshold)
 {
     // Select boxes that have scores above a given threshold.
     std::vector<float> scoresAboveThreshold;
@@ -67,7 +70,7 @@
     // Sort the indices based on scores.
     unsigned int numAboveThreshold = boost::numeric_cast<unsigned int>(scoresAboveThreshold.size());
     std::vector<unsigned int> sortedIndices = GenerateRangeK(numAboveThreshold);
-    TopKSort(numAboveThreshold,sortedIndices.data(), scoresAboveThreshold.data(), numAboveThreshold);
+    TopKSort(numAboveThreshold, sortedIndices.data(), scoresAboveThreshold.data(), numAboveThreshold);
 
     // Number of output cannot be more than max detections specified in the option.
     unsigned int numOutput = std::min(maxDetection, numAboveThreshold);
@@ -98,10 +101,17 @@
     return outputIndices;
 }
 
-void AllocateOutputData(unsigned int numOutput, unsigned int numSelected, const std::vector<float>& boxCorners,
-                        const std::vector<unsigned int>& outputIndices, const std::vector<unsigned int>& selectedBoxes,
-                        const std::vector<unsigned int>& selectedClasses, const std::vector<float>& selectedScores,
-                        float* detectionBoxes, float* detectionScores, float* detectionClasses, float* numDetections)
+void AllocateOutputData(unsigned int numOutput,
+                        unsigned int numSelected,
+                        const std::vector<float>& boxCorners,
+                        const std::vector<unsigned int>& outputIndices,
+                        const std::vector<unsigned int>& selectedBoxes,
+                        const std::vector<unsigned int>& selectedClasses,
+                        const std::vector<float>& selectedScores,
+                        float* detectionBoxes,
+                        float* detectionScores,
+                        float* detectionClasses,
+                        float* numDetections)
 {
     for (unsigned int i = 0; i < numOutput; ++i)
         {
@@ -129,11 +139,6 @@
         numDetections[0] = boost::numeric_cast<float>(numSelected);
 }
 
-} // anonymous namespace
-
-namespace armnn
-{
-
 void DetectionPostProcess(const TensorInfo& boxEncodingsInfo,
                           const TensorInfo& scoresInfo,
                           const TensorInfo& anchorsInfo,
@@ -142,9 +147,9 @@
                           const TensorInfo& detectionScoresInfo,
                           const TensorInfo& numDetectionsInfo,
                           const DetectionPostProcessDescriptor& desc,
-                          const float* boxEncodings,
-                          const float* scores,
-                          const float* anchors,
+                          Decoder<float>& boxEncodings,
+                          Decoder<float>& scores,
+                          Decoder<float>& anchors,
                           float* detectionBoxes,
                           float* detectionClasses,
                           float* detectionScores,
@@ -153,17 +158,51 @@
     // Transform center-size format which is (ycenter, xcenter, height, width) to box-corner format,
     // which represents the lower left corner and the upper right corner (ymin, xmin, ymax, xmax)
     std::vector<float> boxCorners(boxEncodingsInfo.GetNumElements());
-    unsigned int numBoxes = boxEncodingsInfo.GetShape()[1];
+
+    const unsigned int numBoxes  = boxEncodingsInfo.GetShape()[1];
+    const unsigned int numScores = scoresInfo.GetNumElements();
+
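+    // Decode the box encodings and anchors; both decoders are advanced in (y, x, h, w) order for each box.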
     for (unsigned int i = 0; i < numBoxes; ++i)
     {
+        // Y
+        float boxEncodingY = boxEncodings.Get();
+        float anchorY      = anchors.Get();
+
+        ++boxEncodings;
+        ++anchors;
+
+        // X
+        float boxEncodingX = boxEncodings.Get();
+        float anchorX      = anchors.Get();
+
+        ++boxEncodings;
+        ++anchors;
+
+        // H
+        float boxEncodingH = boxEncodings.Get();
+        float anchorH      = anchors.Get();
+
+        ++boxEncodings;
+        ++anchors;
+
+        // W
+        float boxEncodingW = boxEncodings.Get();
+        float anchorW      = anchors.Get();
+
+        ++boxEncodings;
+        ++anchors;
+
+        float yCentre = boxEncodingY / desc.m_ScaleY * anchorH + anchorY;
+        float xCentre = boxEncodingX / desc.m_ScaleX * anchorW + anchorX;
+
+        float halfH = 0.5f * expf(boxEncodingH / desc.m_ScaleH) * anchorH;
+        float halfW = 0.5f * expf(boxEncodingW / desc.m_ScaleW) * anchorW;
+
         unsigned int indexY = i * 4;
         unsigned int indexX = indexY + 1;
         unsigned int indexH = indexX + 1;
         unsigned int indexW = indexH + 1;
-        float yCentre = boxEncodings[indexY] / desc.m_ScaleY * anchors[indexH] + anchors[indexY];
-        float xCentre = boxEncodings[indexX] / desc.m_ScaleX * anchors[indexW] + anchors[indexX];
-        float halfH = 0.5f * expf(boxEncodings[indexH] / desc.m_ScaleH) * anchors[indexH];
-        float halfW = 0.5f * expf(boxEncodings[indexW] / desc.m_ScaleW) * anchors[indexW];
+
         // ymin
         boxCorners[indexY] = yCentre - halfH;
         // xmin
@@ -179,14 +218,29 @@
 
     unsigned int numClassesWithBg = desc.m_NumClasses + 1;
 
+    // Decode scores
+    std::vector<float> decodedScores;
+    decodedScores.reserve(numScores);
+
+    for (unsigned int i = 0u; i < numScores; ++i)
+    {
+        decodedScores.emplace_back(scores.Get());
+        ++scores;
+    }
+
     // Perform Non Max Suppression.
     if (desc.m_UseRegularNms)
     {
         // Perform Regular NMS.
         // For each class, perform NMS and select max detection numbers of the highest score across all classes.
         std::vector<float> classScores(numBoxes);
-        std::vector<unsigned int>selectedBoxesAfterNms;
+
+        std::vector<unsigned int> selectedBoxesAfterNms;
+        selectedBoxesAfterNms.reserve(numBoxes);
+
         std::vector<float> selectedScoresAfterNms;
+        selectedScoresAfterNms.reserve(numScores);
+
         std::vector<unsigned int> selectedClasses;
 
         for (unsigned int c = 0; c < desc.m_NumClasses; ++c)
@@ -194,9 +248,11 @@
             // For each boxes, get scores of the boxes for the class c.
             for (unsigned int i = 0; i < numBoxes; ++i)
             {
-                classScores[i] = scores[i * numClassesWithBg + c + 1];
+                classScores[i] = decodedScores[i * numClassesWithBg + c + 1];
             }
-            std::vector<unsigned int> selectedIndices = NonMaxSuppression(numBoxes, boxCorners, classScores,
+            std::vector<unsigned int> selectedIndices = NonMaxSuppression(numBoxes,
+                                                                          boxCorners,
+                                                                          classScores,
                                                                           desc.m_NmsScoreThreshold,
                                                                           desc.m_DetectionsPerClass,
                                                                           desc.m_NmsIouThreshold);
@@ -237,11 +293,12 @@
 
             // Get the max scores of the box.
             std::vector<unsigned int> maxScoreIndices = GenerateRangeK(desc.m_NumClasses);
-            TopKSort(numClassesPerBox, maxScoreIndices.data(), scores + scoreIndex, desc.m_NumClasses);
+            TopKSort(numClassesPerBox, maxScoreIndices.data(),
+                decodedScores.data() + scoreIndex, desc.m_NumClasses);
 
             for (unsigned int i = 0; i < numClassesPerBox; ++i)
             {
-                maxScores.push_back(scores[scoreIndex + maxScoreIndices[i]]);
+                maxScores.push_back(decodedScores[scoreIndex + maxScoreIndices[i]]);
                 maxScoreClasses.push_back(maxScoreIndices[i]);
                 boxIndices.push_back(box);
             }
diff --git a/src/backends/reference/workloads/DetectionPostProcess.hpp b/src/backends/reference/workloads/DetectionPostProcess.hpp
index 06e9e15..8700a53 100644
--- a/src/backends/reference/workloads/DetectionPostProcess.hpp
+++ b/src/backends/reference/workloads/DetectionPostProcess.hpp
@@ -7,6 +7,10 @@
 #include "armnn/Tensor.hpp"
 #include "armnn/Descriptors.hpp"
 
+#include "Decoders.hpp"
+
+#include <vector>
+
 namespace armnn
 {
 
@@ -18,12 +22,26 @@
                           const TensorInfo& detectionScoresInfo,
                           const TensorInfo& numDetectionsInfo,
                           const DetectionPostProcessDescriptor& desc,
-                          const float* boxEncodings,
-                          const float* scores,
-                          const float* anchors,
+                          Decoder<float>& boxEncodings,
+                          Decoder<float>& scores,
+                          Decoder<float>& anchors,
                           float* detectionBoxes,
                           float* detectionClasses,
                           float* detectionScores,
                           float* numDetections);
 
+void TopKSort(unsigned int k,
+              unsigned int* indices,
+              const float* values,
+              unsigned int numElement);
+
+float IntersectionOverUnion(const float* boxI, const float* boxJ);
+
+std::vector<unsigned int> NonMaxSuppression(unsigned int numBoxes,
+                                            const std::vector<float>& boxCorners,
+                                            const std::vector<float>& scores,
+                                            float nmsScoreThreshold,
+                                            unsigned int maxDetection,
+                                            float nmsIouThreshold);
+
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp
deleted file mode 100644
index ddab046..0000000
--- a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include "RefDetectionPostProcessFloat32Workload.hpp"
-
-#include "DetectionPostProcess.hpp"
-#include "Profiling.hpp"
-#include "RefWorkloadUtils.hpp"
-
-namespace armnn
-{
-
-RefDetectionPostProcessFloat32Workload::RefDetectionPostProcessFloat32Workload(
-        const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : Float32Workload<DetectionPostProcessQueueDescriptor>(descriptor, info),
-          m_Anchors(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Anchors))) {}
-
-void RefDetectionPostProcessFloat32Workload::Execute() const
-{
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessUint8Workload_Execute");
-
-    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& scoresInfo = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& anchorsInfo = GetTensorInfo(m_Anchors.get());
-    const TensorInfo& detectionBoxesInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
-    const TensorInfo& detectionScoresInfo = GetTensorInfo(m_Data.m_Outputs[2]);
-    const TensorInfo& numDetectionsInfo = GetTensorInfo(m_Data.m_Outputs[3]);
-
-    const float* boxEncodings = GetInputTensorDataFloat(0, m_Data);
-    const float* scores = GetInputTensorDataFloat(1, m_Data);
-    const float* anchors = m_Anchors->GetConstTensor<float>();
-
-    float* detectionBoxes = GetOutputTensorData<float>(0, m_Data);
-    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
-    float* detectionScores = GetOutputTensorData<float>(2, m_Data);
-    float* numDetections = GetOutputTensorData<float>(3, m_Data);
-
-    DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
-                         detectionBoxesInfo, detectionClassesInfo,
-                         detectionScoresInfo, numDetectionsInfo, m_Data.m_Parameters,
-                         boxEncodings, scores, anchors, detectionBoxes,
-                         detectionClasses, detectionScores, numDetections);
-}
-
-} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp
deleted file mode 100644
index 9f2a697..0000000
--- a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include <backendsCommon/Workload.hpp>
-#include <backendsCommon/WorkloadData.hpp>
-
-namespace armnn
-{
-
-class RefDetectionPostProcessFloat32Workload : public Float32Workload<DetectionPostProcessQueueDescriptor>
-{
-public:
-    explicit RefDetectionPostProcessFloat32Workload(const DetectionPostProcessQueueDescriptor& descriptor,
-                                                    const WorkloadInfo& info);
-    virtual void Execute() const override;
-
-private:
-    std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
-};
-
-} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp
deleted file mode 100644
index ccdaf87..0000000
--- a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include "RefDetectionPostProcessUint8Workload.hpp"
-
-#include "DetectionPostProcess.hpp"
-#include "Profiling.hpp"
-#include "RefWorkloadUtils.hpp"
-
-namespace armnn
-{
-
-RefDetectionPostProcessUint8Workload::RefDetectionPostProcessUint8Workload(
-        const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : Uint8ToFloat32Workload<DetectionPostProcessQueueDescriptor>(descriptor, info),
-          m_Anchors(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Anchors))) {}
-
-void RefDetectionPostProcessUint8Workload::Execute() const
-{
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessUint8Workload_Execute");
-
-    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& scoresInfo = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& anchorsInfo = GetTensorInfo(m_Anchors.get());
-    const TensorInfo& detectionBoxesInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
-    const TensorInfo& detectionScoresInfo = GetTensorInfo(m_Data.m_Outputs[2]);
-    const TensorInfo& numDetectionsInfo = GetTensorInfo(m_Data.m_Outputs[3]);
-
-    const uint8_t* boxEncodingsData = GetInputTensorDataU8(0, m_Data);
-    const uint8_t* scoresData = GetInputTensorDataU8(1, m_Data);
-    const uint8_t* anchorsData = m_Anchors->GetConstTensor<uint8_t>();
-
-    auto boxEncodings = Dequantize(boxEncodingsData, boxEncodingsInfo);
-    auto scores = Dequantize(scoresData, scoresInfo);
-    auto anchors = Dequantize(anchorsData, anchorsInfo);
-
-    float* detectionBoxes = GetOutputTensorData<float>(0, m_Data);
-    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
-    float* detectionScores = GetOutputTensorData<float>(2, m_Data);
-    float* numDetections = GetOutputTensorData<float>(3, m_Data);
-
-    DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
-                         detectionBoxesInfo, detectionClassesInfo,
-                         detectionScoresInfo, numDetectionsInfo, m_Data.m_Parameters,
-                         boxEncodings.data(), scores.data(), anchors.data(),
-                         detectionBoxes, detectionClasses, detectionScores, numDetections);
-}
-
-} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp
deleted file mode 100644
index 91590f5..0000000
--- a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include <backendsCommon/Workload.hpp>
-#include <backendsCommon/WorkloadData.hpp>
-
-namespace armnn
-{
-
-class RefDetectionPostProcessUint8Workload : public Uint8ToFloat32Workload<DetectionPostProcessQueueDescriptor>
-{
-public:
-    explicit RefDetectionPostProcessUint8Workload(const DetectionPostProcessQueueDescriptor& descriptor,
-                                                  const WorkloadInfo& info);
-    virtual void Execute() const override;
-
-private:
-    std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
-};
-
-} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
new file mode 100644
index 0000000..db24cc5
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -0,0 +1,50 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefDetectionPostProcessWorkload.hpp"
+
+#include "Decoders.hpp"
+#include "DetectionPostProcess.hpp"
+#include "Profiling.hpp"
+#include "RefWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+RefDetectionPostProcessWorkload::RefDetectionPostProcessWorkload(
+        const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
+        : BaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info),
+          m_Anchors(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Anchors))) {}
+
+void RefDetectionPostProcessWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessWorkload_Execute");
+
+    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& scoresInfo       = GetTensorInfo(m_Data.m_Inputs[1]);
+    const TensorInfo& anchorsInfo      = GetTensorInfo(m_Anchors.get());
+
+    const TensorInfo& detectionBoxesInfo   = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
+    const TensorInfo& detectionScoresInfo  = GetTensorInfo(m_Data.m_Outputs[2]);
+    const TensorInfo& numDetectionsInfo    = GetTensorInfo(m_Data.m_Outputs[3]);
+
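+    // The decoders dequantize on the fly, so Float32, QAsymm8 and QSymm16 inputs all take the same path.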
+    auto boxEncodings = MakeDecoder<float>(boxEncodingsInfo, m_Data.m_Inputs[0]->Map());
+    auto scores       = MakeDecoder<float>(scoresInfo, m_Data.m_Inputs[1]->Map());
+    auto anchors      = MakeDecoder<float>(anchorsInfo, m_Anchors->Map(false));
+
+    float* detectionBoxes   = GetOutputTensorData<float>(0, m_Data);
+    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
+    float* detectionScores  = GetOutputTensorData<float>(2, m_Data);
+    float* numDetections    = GetOutputTensorData<float>(3, m_Data);
+
+    DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
+                         detectionBoxesInfo, detectionClassesInfo,
+                         detectionScoresInfo, numDetectionsInfo, m_Data.m_Parameters,
+                         *boxEncodings, *scores, *anchors, detectionBoxes,
+                         detectionClasses, detectionScores, numDetections);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
new file mode 100644
index 0000000..799d0c6
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefDetectionPostProcessWorkload : public BaseWorkload<DetectionPostProcessQueueDescriptor>
+{
+public:
+    explicit RefDetectionPostProcessWorkload(const DetectionPostProcessQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
+};
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 7ccd4ef..8d99b69 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -31,8 +31,7 @@
 #include "RefResizeBilinearFloat32Workload.hpp"
 #include "ResizeBilinear.hpp"
 #include "RefNormalizationFloat32Workload.hpp"
-#include "RefDetectionPostProcessFloat32Workload.hpp"
-#include "RefDetectionPostProcessUint8Workload.hpp"
+#include "RefDetectionPostProcessWorkload.hpp"
 #include "RefPooling2dUint8Workload.hpp"
 #include "BatchNormImpl.hpp"
 #include "Activation.hpp"