Add new op HEATMAP_MAX_KEYPOINT.

Add reference CPU implementation for HEATMAP_MAX_KEYPOINT with NHWC data
layout and FP32 input data type.

Create tests for the new op. The test cases are verified by running the
same op in the Caffe2 library. Generate CTS/VTS tests.

Bug: 113562572

Test: NeuralNetworksTest_static
Change-Id: I75c8edbe93bf862b87866024eeb1bad8bb9be3dc
Merged-In: I75c8edbe93bf862b87866024eeb1bad8bb9be3dc
(cherry picked from commit 012748d5ae792f83c1bfc971ea820d3e4fb63553)
diff --git a/common/Android.bp b/common/Android.bp
index 37a2777..2369c2c 100644
--- a/common/Android.bp
+++ b/common/Android.bp
@@ -67,6 +67,7 @@
         "operations/ExpandDims.cpp",
         "operations/FullyConnected.cpp",
         "operations/HashtableLookup.cpp",
+        "operations/HeatmapMaxKeypoint.cpp",
         "operations/LSHProjection.cpp",
         "operations/LSTM.cpp",
         "operations/Normalization.cpp",
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index ff12d06..e9e983b 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -1530,6 +1530,27 @@
                                    reinterpret_cast<float*>(out.buffer), outShape);
             }
         } break;
+        case OperationType::HEATMAP_MAX_KEYPOINT: {
+            if (!allParametersPresent(2, 1)) {  // 2 inputs (heatmap, boxes), 1 output
+                return ANEURALNETWORKS_BAD_DATA;
+            }
+            const RunTimeOperandInfo& heatmap = mOperands[ins[0]];
+            const RunTimeOperandInfo& boxes = mOperands[ins[1]];
+            // Output operand; its shape is filled in by heatmapMaxKeypointPrepare() below.
+            RunTimeOperandInfo& out = mOperands[outs[0]];
+            Shape outShape = out.shape();
+            // FP32 only (other types leave `success` untouched); && stops at the first failed step.
+            if (heatmap.type == OperandType::TENSOR_FLOAT32) {
+                success = heatmapMaxKeypointPrepare(heatmap.shape(),
+                                                    reinterpret_cast<const float*>(boxes.buffer),
+                                                    boxes.shape(), &outShape) &&
+                          setInfoAndAllocateIfNeeded(&out, outShape) &&
+                          heatmapMaxKeypoint(
+                                  reinterpret_cast<const float*>(heatmap.buffer), heatmap.shape(),
+                                  reinterpret_cast<const float*>(boxes.buffer), boxes.shape(),
+                                  reinterpret_cast<float*>(out.buffer), outShape);
+            }
+        } break;
         default:
             nnAssert(false);
             break;
diff --git a/common/OperationsUtils.cpp b/common/OperationsUtils.cpp
index 97f34ab..1168a77 100644
--- a/common/OperationsUtils.cpp
+++ b/common/OperationsUtils.cpp
@@ -991,5 +991,34 @@
     return true;
 }
 
+// Validates the HEATMAP_MAX_KEYPOINT operands and fills in the output shape
+// [numBoxes, numKeypoints, 3]. Returns true only if every check passes.
+bool heatmapMaxKeypointPrepare(const Shape& heatmapShape, const float* boxesData,
+                               const Shape& boxesShape, Shape* output) {
+    // Check the ranks before reading any dimension, so the sizes below are meaningful.
+    NN_OPS_CHECK(getNumberOfDimensions(heatmapShape) == 4);
+    NN_OPS_CHECK(getNumberOfDimensions(boxesShape) == 2);
+    uint32_t numBoxes = getSizeOfDimension(heatmapShape, 0);
+    uint32_t heatmapSize = getSizeOfDimension(heatmapShape, 1);
+    uint32_t numKeypoints = getSizeOfDimension(heatmapShape, 3);
+    uint32_t boxInfoLength = getSizeOfDimension(boxesShape, 1);
+    // Heatmaps are square; the kernel's border mirroring needs at least 2 cells per side.
+    NN_OPS_CHECK(getSizeOfDimension(heatmapShape, 2) == heatmapSize);
+    NN_OPS_CHECK(heatmapSize >= 2);
+    // One [x1, y1, x2, y2] box per heatmap, each with x1 < x2 and y1 < y2.
+    NN_OPS_CHECK(getSizeOfDimension(boxesShape, 0) == numBoxes);
+    NN_OPS_CHECK(boxInfoLength == 4);
+    const float* boxesDataEnd = boxesData + numBoxes * boxInfoLength;
+    for (const float* boxInfo = boxesData; boxInfo < boxesDataEnd; boxInfo += boxInfoLength) {
+        NN_OPS_CHECK(boxInfo[0] < boxInfo[2]);
+        NN_OPS_CHECK(boxInfo[1] < boxInfo[3]);
+    }
+
+    // The output inherits the element type and quantization fields of the heatmap.
+    output->type = heatmapShape.type;
+    output->dimensions = {numBoxes, numKeypoints, 3};
+    output->offset = heatmapShape.offset;
+    output->scale = heatmapShape.scale;
+    return true;
+}
 } // namespace nn
 } // namespace android
diff --git a/common/Utils.cpp b/common/Utils.cpp
index 1b0cfd1..c193f2e 100644
--- a/common/Utils.cpp
+++ b/common/Utils.cpp
@@ -1596,6 +1596,19 @@
                                                  inExpectedTypes, outputCount, outputIndexes,
                                                  outExpectedTypes);
         }
+        case ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT: {
+            // Signature: (heatmap, boxes) -> keypoints, with every operand being
+            // TENSOR_FLOAT32.
+            if (!(inputCount == 2 && outputCount == 1)) {
+                logInvalidInOutNumber(2, 1);
+                return ANEURALNETWORKS_BAD_DATA;
+            }
+            std::vector<OperandType> inExpectedTypes(2, OperandType::TENSOR_FLOAT32);
+            std::vector<OperandType> outExpectedTypes(1, OperandType::TENSOR_FLOAT32);
+            return validateOperationOperandTypes(operands, inputCount, inputIndexes,
+                                                 inExpectedTypes, outputCount, outputIndexes,
+                                                 outExpectedTypes);
+        }
         default:
             return ANEURALNETWORKS_BAD_DATA;
     }
diff --git a/common/include/Operations.h b/common/include/Operations.h
index 641b522..2e09409 100644
--- a/common/include/Operations.h
+++ b/common/include/Operations.h
@@ -254,6 +254,9 @@
 bool roiAlign(const float* inputData, const Shape& inputShape, const float* roiData,
               const Shape& roiShape, float spatialScale, int32_t samplingRatio, float* outputData,
               const Shape& outputShape);
+
+bool heatmapMaxKeypoint(const float* heatmap, const Shape& heatmapShape, const float* boxes,
+                        const Shape& boxesShape, float* outputData, const Shape& outputShape);
 } // namespace nn
 } // namespace android
 #endif // ANDROID_ML_NN_COMMON_OPERATIONS_H
diff --git a/common/include/OperationsUtils.h b/common/include/OperationsUtils.h
index d1198f2..3c39831 100644
--- a/common/include/OperationsUtils.h
+++ b/common/include/OperationsUtils.h
@@ -294,6 +294,9 @@
 bool roiAlignPrepare(const Shape& input, const float* roiData, const Shape& roiShape,
                      const int32_t* outputShapeData, const Shape& outputShapeShape,
                      const float spatialScale, Shape* output);
+
+bool heatmapMaxKeypointPrepare(const Shape& heatmapShape, const float* boxesData,
+                               const Shape& boxesShape, Shape* output);
 } // namespace nn
 } // namespace android
 
diff --git a/common/operations/HeatmapMaxKeypoint.cpp b/common/operations/HeatmapMaxKeypoint.cpp
new file mode 100644
index 0000000..e17ad71
--- /dev/null
+++ b/common/operations/HeatmapMaxKeypoint.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CpuOperationUtils.h"
+#include "Operations.h"
+
+#include <cfloat>
+#include <cmath>
+
+#include "Tracing.h"
+
+namespace android {
+namespace nn {
+
+// Refines a heatmap peak to sub-cell precision by fitting a quadratic (2nd order Taylor
+// expansion) around the centre of a 3x3 score window, approximating bicubic upscaling:
+//   D(x) = D - b'x + 1/2 * x'Ax
+// where D = grid[1][1] is the original score, x = delta is the position correction
+// ({width, height} offset in cells) and D(x) = deltaScore is the corrected score.
+// If A is not safely invertible, delta and deltaScore are left untouched.
+static void solveForDelta(const float grid[3][3], float* delta, float* deltaScore) {
+    const float center = grid[1][1];
+    // b: negated first-order central differences along width and height.
+    const float b[2] = {-(grid[1][2] - grid[1][0]) / 2.0f, -(grid[2][1] - grid[0][1]) / 2.0f};
+    // A: Hessian at the centre (second-order differences); symmetric off-diagonal term.
+    const float mixed = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f;
+    const float A[2][2] = {{grid[1][0] - 2.0f * center + grid[1][2], mixed},
+                           {mixed, grid[0][1] - 2.0f * center + grid[2][1]}};
+
+    // Solve Ax = b with the closed-form 2x2 inverse: delta = inv(A) * b.
+    const float diagProd = A[0][0] * A[1][1];
+    const float offDiagProd = A[0][1] * A[1][0];
+    const float detA = diagProd - offDiagProd;
+    // Give up (keeping the caller's defaults) when the determinant is too small.
+    if (std::abs(detA) < std::max(1e-5f, 1e-5f * diagProd)) return;
+    delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA;
+    delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA;
+
+    // Clip corrections larger than 3/2 cells, scaling both components by the same factor.
+    if (std::abs(delta[0]) > 1.5f || std::abs(delta[1]) > 1.5f) {
+        const float scale = 1.5f / std::max(std::abs(delta[0]), std::abs(delta[1]));
+        delta[0] *= scale;
+        delta[1] *= scale;
+    }
+
+    // Evaluate the quadratic model at the (possibly clipped) correction.
+    *deltaScore = center - b[0] * delta[0] - b[1] * delta[1] +
+                  ((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] +
+                   (A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) /
+                          2.0f;
+}
+
+// Reference CPU kernel (NHWC, FP32). For each (box, keypoint) pair, finds the best-scoring
+// heatmap cell, refines it with solveForDelta() on its 3x3 neighbourhood and writes
+// [x, y, score] in box coordinates. Expects shapes vetted by heatmapMaxKeypointPrepare().
+bool heatmapMaxKeypoint(const float* heatmap, const Shape& heatmapShape, const float* boxes,
+                        const Shape& boxesShape, float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("HeatmapMaxKeypoint");
+
+    const uint32_t numBoxes = getSizeOfDimension(heatmapShape, 0);
+    const uint32_t heatmapSize = getSizeOfDimension(heatmapShape, 1);
+    const uint32_t numKeypoints = getSizeOfDimension(heatmapShape, 3);
+    const uint32_t boxInfoLength = getSizeOfDimension(boxesShape, 1);
+    const uint32_t outputInfoLength = getSizeOfDimension(outputShape, 2);
+    // Signed copy so the mirroring below never mixes signed and unsigned operands.
+    const int32_t lastIndex = static_cast<int32_t>(heatmapSize) - 1;
+    const uint32_t cellsPerHeatmap = heatmapSize * heatmapSize;
+
+    const float* heatmapBase = heatmap;
+    const float* boxInfoBase = boxes;
+    float* outputBase = outputData;
+    for (uint32_t i = 0; i < numBoxes; i++) {
+        // Box geometry [x1, y1, x2, y2] is shared by every keypoint of this box.
+        const float wRoiStart = boxInfoBase[0];
+        const float hRoiStart = boxInfoBase[1];
+        const float roiWidth = boxInfoBase[2] - wRoiStart;
+        const float roiHeight = boxInfoBase[3] - hRoiStart;
+        for (uint32_t j = 0; j < numKeypoints; j++) {
+            // Find the max score and its flat (row-major) cell index; the first max wins.
+            uint32_t maxIndex = 0;
+            float maxScore = -FLT_MAX;
+            for (uint32_t k = 0; k < cellsPerHeatmap; k++) {
+                const float val = heatmapBase[k * numKeypoints + j];
+                if (maxScore < val) {
+                    maxScore = val;
+                    maxIndex = k;
+                }
+            }
+            const uint32_t maxIndexWidth = maxIndex % heatmapSize;
+            const uint32_t maxIndexHeight = maxIndex / heatmapSize;
+
+            // Gather the 3x3 neighbourhood around the max. Out-of-bound rows/columns are
+            // mirrored about the border cell, which requires heatmapSize >= 2.
+            float localGrid[3][3];
+            for (int32_t dh = -1; dh <= 1; dh++) {
+                int32_t h = static_cast<int32_t>(maxIndexHeight) + dh;
+                h = h < 0 ? 1 : (h > lastIndex ? lastIndex - 1 : h);
+                for (int32_t dw = -1; dw <= 1; dw++) {
+                    int32_t w = static_cast<int32_t>(maxIndexWidth) + dw;
+                    w = w < 0 ? 1 : (w > lastIndex ? lastIndex - 1 : w);
+                    const uint32_t heatmapIndex =
+                            static_cast<uint32_t>(h) * heatmapSize * numKeypoints +
+                            static_cast<uint32_t>(w) * numKeypoints + j;
+                    localGrid[dh + 1][dw + 1] = heatmapBase[heatmapIndex];
+                }
+            }
+            // These defaults are kept when solveForDelta() returns without a solution.
+            float delta[2] = {0.0f, 0.0f}, deltaScore = maxScore;
+            solveForDelta(localGrid, delta, &deltaScore);
+            // Map the refined cell-centre position from heatmap cells to box coordinates.
+            const float wRelativePos = (static_cast<float>(maxIndexWidth) + delta[0] + 0.5f) /
+                                       static_cast<float>(heatmapSize);
+            const float hRelativePos = (static_cast<float>(maxIndexHeight) + delta[1] + 0.5f) /
+                                       static_cast<float>(heatmapSize);
+            outputBase[0] = wRelativePos * roiWidth + wRoiStart;
+            outputBase[1] = hRelativePos * roiHeight + hRoiStart;
+            outputBase[2] = deltaScore;
+            outputBase += outputInfoLength;
+        }
+        boxInfoBase += boxInfoLength;
+        heatmapBase += cellsPerHeatmap * numKeypoints;
+    }
+    return true;
+}
+
+}  // namespace nn
+}  // namespace android
diff --git a/runtime/include/NeuralNetworks.h b/runtime/include/NeuralNetworks.h
index 2bb083a..ed59585 100644
--- a/runtime/include/NeuralNetworks.h
+++ b/runtime/include/NeuralNetworks.h
@@ -1933,6 +1933,36 @@
     ANEURALNETWORKS_GREATER = 53,
     ANEURALNETWORKS_GREATER_EQUAL = 54,
     ANEURALNETWORKS_GROUPED_CONV_2D = 55,
+
+    /**
+     * Localize the maximum keypoints from heatmaps.
+     *
+     * This operation approximates the accurate maximum keypoint scores and
+     * indices after bicubic upscaling by using Taylor expansion up to the
+     * quadratic term.
+     *
+     * The bounding box is represented by its upper-left corner coordinate
+     * (x1,y1) and lower-right corner coordinate (x2,y2) in the original image.
+     * A valid bounding box should satisfy x1 < x2 and y1 < y2.
+     *
+     * Supported tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *
+     * Inputs:
+     * * 0: A 4-D Tensor of shape
+     *      [num_boxes, heatmap_size, heatmap_size, num_keypoints],
+     *      specifying the heatmaps, the height and width of heatmaps should
+     *      be the same, and must be greater than or equal to 2.
+     * * 1: A 2-D Tensor of shape [num_boxes, 4], specifying the bounding boxes,
+     *      each with format [x1, y1, x2, y2].
+     *
+     * Outputs:
+     * * 0: A tensor of the same {@link OperandCode} as input0, with shape
+     *      [num_boxes, num_keypoints, 3], specifying the location and score of
+     *      the keypoints, each with format [keypoint_x, keypoint_y, score].
+     *
+     * Available since API level 29.
+     */
     ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT = 56,
     ANEURALNETWORKS_LESS = 57,
     ANEURALNETWORKS_LESS_EQUAL = 58,
diff --git a/runtime/test/for-cts/TestGeneratedOneFile.cpp b/runtime/test/for-cts/TestGeneratedOneFile.cpp
index 2ada5ac..4ec1b34 100644
--- a/runtime/test/for-cts/TestGeneratedOneFile.cpp
+++ b/runtime/test/for-cts/TestGeneratedOneFile.cpp
@@ -344,6 +344,7 @@
 #include "../generated/tests/argmin_2_quant8.mod.py.cpp"
 #include "../generated/tests/argmin_3_float.mod.py.cpp"
 #include "../generated/tests/expand_dims.mod.py.cpp"
+#include "../generated/tests/heatmap_max_keypoint.mod.py.cpp"
 #include "../generated/tests/lsh_projection_3_relaxed.mod.py.cpp"
 #include "../generated/tests/lsh_projection_4_relaxed.mod.py.cpp"
 #include "../generated/tests/lsh_projection_deprecated.mod.py.cpp"
diff --git a/runtime/test/generated/all_generated_V1_2_vts_tests.cpp b/runtime/test/generated/all_generated_V1_2_vts_tests.cpp
index 604c37b..5008d7a 100644
--- a/runtime/test/generated/all_generated_V1_2_vts_tests.cpp
+++ b/runtime/test/generated/all_generated_V1_2_vts_tests.cpp
@@ -391,6 +391,42 @@
                            expand_dims::examples_int32_4);
 }
 
+// Generated from: heatmap_max_keypoint.mod.py.
+namespace heatmap_max_keypoint {
+// Generated heatmap_max_keypoint test
+#include "examples/heatmap_max_keypoint.example.cpp"
+// Generated model constructor
+#include "vts_models/heatmap_max_keypoint.model.cpp"
+} // namespace heatmap_max_keypoint
+
+TEST_F(NeuralnetworksHidlTest, heatmap_max_keypoint) {
+  generated_tests::Execute(device,
+                           heatmap_max_keypoint::createTestModel,
+                           heatmap_max_keypoint::is_ignored,
+                           heatmap_max_keypoint::examples);
+}
+
+TEST_F(NeuralnetworksHidlTest, heatmap_max_keypoint_relaxed) {
+  generated_tests::Execute(device,
+                           heatmap_max_keypoint::createTestModel_relaxed,
+                           heatmap_max_keypoint::is_ignored_relaxed,
+                           heatmap_max_keypoint::examples_relaxed);
+}
+
+TEST_F(NeuralnetworksHidlTest, heatmap_max_keypoint_2) {
+  generated_tests::Execute(device,
+                           heatmap_max_keypoint::createTestModel_2,
+                           heatmap_max_keypoint::is_ignored_2,
+                           heatmap_max_keypoint::examples_2);
+}
+
+TEST_F(NeuralnetworksHidlTest, heatmap_max_keypoint_relaxed_2) {
+  generated_tests::Execute(device,
+                           heatmap_max_keypoint::createTestModel_relaxed_2,
+                           heatmap_max_keypoint::is_ignored_relaxed_2,
+                           heatmap_max_keypoint::examples_relaxed_2);
+}
+
 // Generated from: lsh_projection_3_relaxed.mod.py.
 namespace lsh_projection_3_relaxed {
 // Generated lsh_projection_3_relaxed test
diff --git a/runtime/test/generated/examples/heatmap_max_keypoint.example.cpp b/runtime/test/generated/examples/heatmap_max_keypoint.example.cpp
new file mode 100644
index 0000000..3f5e0bd
--- /dev/null
+++ b/runtime/test/generated/examples/heatmap_max_keypoint.example.cpp
@@ -0,0 +1,98 @@
+// clang-format off
+// Generated file (from: heatmap_max_keypoint.mod.py). Do not edit
+std::vector<MixedTypedExample> examples = {
+// Begin of an example
+{
+//Input(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {-10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 10.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 4.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, 10.0f, -8.0f, -2.0f, 4.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -56.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -57.827329175f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f}}, {1, {5.0f, 2.0f, 10.0f, 20.0f, 1.0f, 7.0f, 30.0f, 10.0f, 8.0f, 3.0f, 15.0f, 13.0f, 6.0f, 5.0f, 19.0f, 12.0f, 5.0f, 2.0f, 10.0f, 20.0f, 5.0f, 2.0f, 10.0f, 20.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+},
+//Output(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {8.224462f, 8.537316f, 9.071493f, 11.73f, 9.625f, 10.005f, 8.875f, 9.5625f, 7.1875f, 17.375f, 5.875f, 10.0f, 9.569672f, 2.0f, 10.689667f, 8.125f, 8.75f, 9.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+}
+}, // End of an example
+};
+
+std::vector<MixedTypedExample> examples_relaxed = {
+// Begin of an example
+{
+//Input(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {-10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 10.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, -5.0f, -8.0f, -2.0f, 4.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -1.0f, 4.0f, 10.0f, -8.0f, -2.0f, 4.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -56.0f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f, -10.0f, -57.827329175f, 4.0f, -5.0f, -8.0f, -2.0f, 9.0f, 1.0f, 7.0f, -2.0f, 3.0f, -7.0f, -2.0f, 2.0f, -3.0f, 5.0f}}, {1, {5.0f, 2.0f, 10.0f, 20.0f, 1.0f, 7.0f, 30.0f, 10.0f, 8.0f, 3.0f, 15.0f, 13.0f, 6.0f, 5.0f, 19.0f, 12.0f, 5.0f, 2.0f, 10.0f, 20.0f, 5.0f, 2.0f, 10.0f, 20.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+},
+//Output(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {8.224462f, 8.537316f, 9.071493f, 11.73f, 9.625f, 10.005f, 8.875f, 9.5625f, 7.1875f, 17.375f, 5.875f, 10.0f, 9.569672f, 2.0f, 10.689667f, 8.125f, 8.75f, 9.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+}
+}, // End of an example
+};
+
+std::vector<MixedTypedExample> examples_2 = {
+// Begin of an example
+{
+//Input(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {0.19f, 0.61f, 0.49f, 0.01f, 0.98f, 0.65f, 0.64f, 0.7f, 0.76f, 0.55f, 0.83f, 0.19f, 0.46f, 0.03f, 0.67f, 0.71f, 0.17f, 0.23f, 0.89f, 0.08f, 0.96f, 0.65f, 0.52f, 0.4f, 0.36f, 0.8f, 0.55f, 0.89f, 0.58f, 0.29f, 0.27f, 0.69f, 0.66f, 0.06f, 0.51f, 0.26f, 0.96f, 0.38f, 0.41f, 0.89f, 0.88f, 0.46f, 0.96f, 0.73f, 0.54f, 0.64f, 0.84f, 0.74f, 0.51f, 0.41f, 0.13f, 0.19f, 0.52f, 0.21f, 0.5f, 0.75f, 0.89f, 0.89f, 0.2f, 0.58f, 0.7f, 0.13f, 0.29f, 0.39f, 0.91f, 0.06f, 0.93f, 0.34f, 0.8f, 0.87f, 0.59f, 0.67f, 0.57f, 0.85f, 0.24f, 0.25f, 0.76f, 0.34f, 0.37f, 0.11f, 0.0f, 0.29f, 0.3f, 0.77f, 0.34f, 0.57f, 0.48f, 0.76f, 0.93f, 0.18f, 0.64f, 0.12f, 0.67f, 0.47f, 0.56f, 0.5f, 0.48f, 0.99f, 0.46f, 0.66f, 0.98f, 0.06f, 0.1f, 0.66f, 0.66f, 0.91f, 0.67f, 0.23f, 0.4f, 0.37f, 0.17f, 0.35f, 0.48f, 0.98f, 0.47f, 0.49f, 0.56f, 0.18f, 0.75f, 0.29f, 0.04f, 0.23f, 0.42f, 0.55f, 0.38f, 0.07f, 0.71f, 0.8f}}, {1, {5.0f, 2.0f, 10.0f, 20.0f, 1.0f, 7.0f, 30.0f, 10.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+},
+//Output(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {7.227723f, 4.25f, 1.02021f, 8.090278f, 17.75f, 0.890556f, 8.523379f, 12.589181f, 1.00711f, 8.36558f, 10.122508f, 0.945129f, 12.431603f, 8.934225f, 0.987798f, 4.625f, 9.239437f, 1.07382f, 4.625f, 7.375f, 0.93f, 26.375f, 9.625f, 0.8f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+}
+}, // End of an example
+};
+
+std::vector<MixedTypedExample> examples_relaxed_2 = {
+// Begin of an example
+{
+//Input(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {0.19f, 0.61f, 0.49f, 0.01f, 0.98f, 0.65f, 0.64f, 0.7f, 0.76f, 0.55f, 0.83f, 0.19f, 0.46f, 0.03f, 0.67f, 0.71f, 0.17f, 0.23f, 0.89f, 0.08f, 0.96f, 0.65f, 0.52f, 0.4f, 0.36f, 0.8f, 0.55f, 0.89f, 0.58f, 0.29f, 0.27f, 0.69f, 0.66f, 0.06f, 0.51f, 0.26f, 0.96f, 0.38f, 0.41f, 0.89f, 0.88f, 0.46f, 0.96f, 0.73f, 0.54f, 0.64f, 0.84f, 0.74f, 0.51f, 0.41f, 0.13f, 0.19f, 0.52f, 0.21f, 0.5f, 0.75f, 0.89f, 0.89f, 0.2f, 0.58f, 0.7f, 0.13f, 0.29f, 0.39f, 0.91f, 0.06f, 0.93f, 0.34f, 0.8f, 0.87f, 0.59f, 0.67f, 0.57f, 0.85f, 0.24f, 0.25f, 0.76f, 0.34f, 0.37f, 0.11f, 0.0f, 0.29f, 0.3f, 0.77f, 0.34f, 0.57f, 0.48f, 0.76f, 0.93f, 0.18f, 0.64f, 0.12f, 0.67f, 0.47f, 0.56f, 0.5f, 0.48f, 0.99f, 0.46f, 0.66f, 0.98f, 0.06f, 0.1f, 0.66f, 0.66f, 0.91f, 0.67f, 0.23f, 0.4f, 0.37f, 0.17f, 0.35f, 0.48f, 0.98f, 0.47f, 0.49f, 0.56f, 0.18f, 0.75f, 0.29f, 0.04f, 0.23f, 0.42f, 0.55f, 0.38f, 0.07f, 0.71f, 0.8f}}, {1, {5.0f, 2.0f, 10.0f, 20.0f, 1.0f, 7.0f, 30.0f, 10.0f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+},
+//Output(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {7.227723f, 4.25f, 1.02021f, 8.090278f, 17.75f, 0.890556f, 8.523379f, 12.589181f, 1.00711f, 8.36558f, 10.122508f, 0.945129f, 12.431603f, 8.934225f, 0.987798f, 4.625f, 9.239437f, 1.07382f, 4.625f, 7.375f, 0.93f, 26.375f, 9.625f, 0.8f}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+}
+}, // End of an example
+};
+
diff --git a/runtime/test/generated/models/heatmap_max_keypoint.model.cpp b/runtime/test/generated/models/heatmap_max_keypoint.model.cpp
new file mode 100644
index 0000000..6bc0161
--- /dev/null
+++ b/runtime/test/generated/models/heatmap_max_keypoint.model.cpp
@@ -0,0 +1,106 @@
+// clang-format off
+// Generated file (from: heatmap_max_keypoint.mod.py). Do not edit
+void CreateModel(Model *model) {
+  OperandType type0(Type::TENSOR_FLOAT32, {6, 4, 4, 1});
+  OperandType type1(Type::TENSOR_FLOAT32, {6, 4});
+  OperandType type2(Type::TENSOR_FLOAT32, {6, 1, 3});
+  OperandType type3(Type::TENSOR_FLOAT32, {2, 4, 4, 4});
+  OperandType type4(Type::TENSOR_FLOAT32, {2, 4});
+  OperandType type5(Type::TENSOR_FLOAT32, {2, 4, 3});
+  // Phase 1, operands
+  auto heatmap = model->addOperand(&type0);
+  auto boxes = model->addOperand(&type1);
+  auto out = model->addOperand(&type2);
+  // Phase 2, operations
+  model->addOperation(ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT, {heatmap, boxes}, {out});
+  // Phase 3, inputs and outputs
+  model->identifyInputsAndOutputs(
+    {heatmap, boxes},
+    {out});
+  assert(model->isValid());
+}
+
+bool is_ignored(int i) {
+  static std::set<int> ignore = {};
+  return ignore.find(i) != ignore.end();
+}
+
+void CreateModel_relaxed(Model *model) {
+  OperandType type0(Type::TENSOR_FLOAT32, {6, 4, 4, 1});
+  OperandType type1(Type::TENSOR_FLOAT32, {6, 4});
+  OperandType type2(Type::TENSOR_FLOAT32, {6, 1, 3});
+  OperandType type3(Type::TENSOR_FLOAT32, {2, 4, 4, 4});
+  OperandType type4(Type::TENSOR_FLOAT32, {2, 4});
+  OperandType type5(Type::TENSOR_FLOAT32, {2, 4, 3});
+  // Phase 1, operands
+  auto heatmap = model->addOperand(&type0);
+  auto boxes = model->addOperand(&type1);
+  auto out = model->addOperand(&type2);
+  // Phase 2, operations
+  model->addOperation(ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT, {heatmap, boxes}, {out});
+  // Phase 3, inputs and outputs
+  model->identifyInputsAndOutputs(
+    {heatmap, boxes},
+    {out});
+  // Phase 4: set relaxed execution
+  model->relaxComputationFloat32toFloat16(true);
+  assert(model->isValid());
+}
+
+bool is_ignored_relaxed(int i) {
+  static std::set<int> ignore = {};
+  return ignore.find(i) != ignore.end();
+}
+
+void CreateModel_2(Model *model) {
+  OperandType type0(Type::TENSOR_FLOAT32, {6, 4, 4, 1});
+  OperandType type1(Type::TENSOR_FLOAT32, {6, 4});
+  OperandType type2(Type::TENSOR_FLOAT32, {6, 1, 3});
+  OperandType type3(Type::TENSOR_FLOAT32, {2, 4, 4, 4});
+  OperandType type4(Type::TENSOR_FLOAT32, {2, 4});
+  OperandType type5(Type::TENSOR_FLOAT32, {2, 4, 3});
+  // Phase 1, operands
+  auto heatmap1 = model->addOperand(&type3);
+  auto boxes1 = model->addOperand(&type4);
+  auto out1 = model->addOperand(&type5);
+  // Phase 2, operations
+  model->addOperation(ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT, {heatmap1, boxes1}, {out1});
+  // Phase 3, inputs and outputs
+  model->identifyInputsAndOutputs(
+    {heatmap1, boxes1},
+    {out1});
+  assert(model->isValid());
+}
+
+bool is_ignored_2(int i) {
+  static std::set<int> ignore = {};
+  return ignore.find(i) != ignore.end();
+}
+
+void CreateModel_relaxed_2(Model *model) {
+  OperandType type0(Type::TENSOR_FLOAT32, {6, 4, 4, 1});
+  OperandType type1(Type::TENSOR_FLOAT32, {6, 4});
+  OperandType type2(Type::TENSOR_FLOAT32, {6, 1, 3});
+  OperandType type3(Type::TENSOR_FLOAT32, {2, 4, 4, 4});
+  OperandType type4(Type::TENSOR_FLOAT32, {2, 4});
+  OperandType type5(Type::TENSOR_FLOAT32, {2, 4, 3});
+  // Phase 1, operands
+  auto heatmap1 = model->addOperand(&type3);
+  auto boxes1 = model->addOperand(&type4);
+  auto out1 = model->addOperand(&type5);
+  // Phase 2, operations
+  model->addOperation(ANEURALNETWORKS_HEATMAP_MAX_KEYPOINT, {heatmap1, boxes1}, {out1});
+  // Phase 3, inputs and outputs
+  model->identifyInputsAndOutputs(
+    {heatmap1, boxes1},
+    {out1});
+  // Phase 4: set relaxed execution
+  model->relaxComputationFloat32toFloat16(true);
+  assert(model->isValid());
+}
+
+bool is_ignored_relaxed_2(int i) {
+  static std::set<int> ignore = {};
+  return ignore.find(i) != ignore.end();
+}
+
diff --git a/runtime/test/generated/tests/heatmap_max_keypoint.mod.py.cpp b/runtime/test/generated/tests/heatmap_max_keypoint.mod.py.cpp
new file mode 100644
index 0000000..d1968ae
--- /dev/null
+++ b/runtime/test/generated/tests/heatmap_max_keypoint.mod.py.cpp
@@ -0,0 +1,35 @@
+// clang-format off
+// Generated file (from: heatmap_max_keypoint.mod.py). Do not edit
+#include "../../TestGenerated.h"
+
+namespace heatmap_max_keypoint {
+// Generated heatmap_max_keypoint test
+#include "generated/examples/heatmap_max_keypoint.example.cpp"
+// Generated model constructor
+#include "generated/models/heatmap_max_keypoint.model.cpp"
+} // namespace heatmap_max_keypoint
+
+TEST_F(GeneratedTests, heatmap_max_keypoint) {
+    execute(heatmap_max_keypoint::CreateModel,
+            heatmap_max_keypoint::is_ignored,
+            heatmap_max_keypoint::examples);
+}
+
+TEST_F(GeneratedTests, heatmap_max_keypoint_relaxed) {
+    execute(heatmap_max_keypoint::CreateModel_relaxed,
+            heatmap_max_keypoint::is_ignored_relaxed,
+            heatmap_max_keypoint::examples_relaxed);
+}
+
+TEST_F(GeneratedTests, heatmap_max_keypoint_2) {
+    execute(heatmap_max_keypoint::CreateModel_2,
+            heatmap_max_keypoint::is_ignored_2,
+            heatmap_max_keypoint::examples_2);
+}
+
+TEST_F(GeneratedTests, heatmap_max_keypoint_relaxed_2) {
+    execute(heatmap_max_keypoint::CreateModel_relaxed_2,
+            heatmap_max_keypoint::is_ignored_relaxed_2,
+            heatmap_max_keypoint::examples_relaxed_2);
+}
+
diff --git a/runtime/test/generated/vts_models/heatmap_max_keypoint.model.cpp b/runtime/test/generated/vts_models/heatmap_max_keypoint.model.cpp
new file mode 100644
index 0000000..052461d
--- /dev/null
+++ b/runtime/test/generated/vts_models/heatmap_max_keypoint.model.cpp
@@ -0,0 +1,244 @@
+// clang-format off
+// Generated file (from: heatmap_max_keypoint.mod.py). Do not edit
+// Create the model: a single HEATMAP_MAX_KEYPOINT operation using the TEST 1 shapes of heatmap_max_keypoint.mod.py
+Model createTestModel() {
+    const std::vector<Operand> operands = {
+        {  // operand 0: first operation input ("heatmap" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 4, 4, 1},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 1: second operation input ("boxes" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 2: operation output ("out" in the spec); no operation consumes it
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 1, 3},
+            .numberOfConsumers = 0,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_OUTPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        }
+    };
+
+    const std::vector<Operation> operations = {
+        {
+            .type = OperationType::HEATMAP_MAX_KEYPOINT,
+            .inputs = {0, 1},  // operand indexes
+            .outputs = {2},
+        }
+    };
+
+    const std::vector<uint32_t> inputIndexes = {0, 1};
+    const std::vector<uint32_t> outputIndexes = {2};
+    std::vector<uint8_t> operandValues = {};  // empty: every operand above is a model input or output
+    const std::vector<hidl_memory> pools = {};
+
+    return {
+        .operands = operands,
+        .operations = operations,
+        .inputIndexes = inputIndexes,
+        .outputIndexes = outputIndexes,
+        .operandValues = operandValues,
+        .pools = pools,
+    };
+}
+
+bool is_ignored(int i) {  // membership test against a fixed, currently empty set
+  static std::set<int> skipped = {};
+  return skipped.count(i) != 0;
+}
+
+// Create the model: same operands and operation as createTestModel(), with relaxed fp32-to-fp16 computation enabled
+Model createTestModel_relaxed() {
+    const std::vector<Operand> operands = {
+        {  // operand 0: first operation input ("heatmap" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 4, 4, 1},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 1: second operation input ("boxes" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 2: operation output ("out" in the spec); no operation consumes it
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {6, 1, 3},
+            .numberOfConsumers = 0,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_OUTPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        }
+    };
+
+    const std::vector<Operation> operations = {
+        {
+            .type = OperationType::HEATMAP_MAX_KEYPOINT,
+            .inputs = {0, 1},  // operand indexes
+            .outputs = {2},
+        }
+    };
+
+    const std::vector<uint32_t> inputIndexes = {0, 1};
+    const std::vector<uint32_t> outputIndexes = {2};
+    std::vector<uint8_t> operandValues = {};  // empty: every operand above is a model input or output
+    const std::vector<hidl_memory> pools = {};
+
+    return {
+        .operands = operands,
+        .operations = operations,
+        .inputIndexes = inputIndexes,
+        .outputIndexes = outputIndexes,
+        .operandValues = operandValues,
+        .pools = pools,
+        .relaxComputationFloat32toFloat16 = true,  // the only field that differs from createTestModel()
+    };
+}
+
+bool is_ignored_relaxed(int i) {  // membership test against a fixed, currently empty set
+  static std::set<int> skipped = {};
+  return skipped.count(i) != 0;
+}
+
+// Create the model: a single HEATMAP_MAX_KEYPOINT operation using the TEST 2 shapes of heatmap_max_keypoint.mod.py
+Model createTestModel_2() {
+    const std::vector<Operand> operands = {
+        {  // operand 0: first operation input ("heatmap" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4, 4, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 1: second operation input ("boxes" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 2: operation output ("out" in the spec); no operation consumes it
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4, 3},
+            .numberOfConsumers = 0,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_OUTPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        }
+    };
+
+    const std::vector<Operation> operations = {
+        {
+            .type = OperationType::HEATMAP_MAX_KEYPOINT,
+            .inputs = {0, 1},  // operand indexes
+            .outputs = {2},
+        }
+    };
+
+    const std::vector<uint32_t> inputIndexes = {0, 1};
+    const std::vector<uint32_t> outputIndexes = {2};
+    std::vector<uint8_t> operandValues = {};  // empty: every operand above is a model input or output
+    const std::vector<hidl_memory> pools = {};
+
+    return {
+        .operands = operands,
+        .operations = operations,
+        .inputIndexes = inputIndexes,
+        .outputIndexes = outputIndexes,
+        .operandValues = operandValues,
+        .pools = pools,
+    };
+}
+
+bool is_ignored_2(int i) {  // membership test against a fixed, currently empty set
+  static std::set<int> skipped = {};
+  return skipped.count(i) != 0;
+}
+
+// Create the model: same operands and operation as createTestModel_2(), with relaxed fp32-to-fp16 computation enabled
+Model createTestModel_relaxed_2() {
+    const std::vector<Operand> operands = {
+        {  // operand 0: first operation input ("heatmap" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4, 4, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 1: second operation input ("boxes" in the spec)
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4},
+            .numberOfConsumers = 1,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_INPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        },
+        {  // operand 2: operation output ("out" in the spec); no operation consumes it
+            .type = OperandType::TENSOR_FLOAT32,
+            .dimensions = {2, 4, 3},
+            .numberOfConsumers = 0,
+            .scale = 0.0f,
+            .zeroPoint = 0,
+            .lifetime = OperandLifeTime::MODEL_OUTPUT,
+            .location = {.poolIndex = 0, .offset = 0, .length = 0},
+        }
+    };
+
+    const std::vector<Operation> operations = {
+        {
+            .type = OperationType::HEATMAP_MAX_KEYPOINT,
+            .inputs = {0, 1},  // operand indexes
+            .outputs = {2},
+        }
+    };
+
+    const std::vector<uint32_t> inputIndexes = {0, 1};
+    const std::vector<uint32_t> outputIndexes = {2};
+    std::vector<uint8_t> operandValues = {};  // empty: every operand above is a model input or output
+    const std::vector<hidl_memory> pools = {};
+
+    return {
+        .operands = operands,
+        .operations = operations,
+        .inputIndexes = inputIndexes,
+        .outputIndexes = outputIndexes,
+        .operandValues = operandValues,
+        .pools = pools,
+        .relaxComputationFloat32toFloat16 = true,  // the only field that differs from createTestModel_2()
+    };
+}
+
+bool is_ignored_relaxed_2(int i) {  // membership test against a fixed, currently empty set
+  static std::set<int> skipped = {};
+  return skipped.count(i) != 0;
+}
+
diff --git a/runtime/test/specs/V1_2/heatmap_max_keypoint.mod.py b/runtime/test/specs/V1_2/heatmap_max_keypoint.mod.py
new file mode 100644
index 0000000..0e77c6a
--- /dev/null
+++ b/runtime/test/specs/V1_2/heatmap_max_keypoint.mod.py
@@ -0,0 +1,108 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# TEST 1: HEATMAP_MAX_KEYPOINT_1 - six 4x4 single-channel heatmaps, one box each, covering edge cases
+heatmap1 = Input("heatmap", "TENSOR_FLOAT32", "{6, 4, 4, 1}")  # NHWC per the commit message; 16 values per box
+boxes1 = Input("boxes", "TENSOR_FLOAT32", "{6, 4}")  # 4 floats per box; presumably [x1, y1, x2, y2] - TODO confirm
+o1 = Output("out", "TENSOR_FLOAT32", "{6, 1, 3}")  # 3 floats per box and channel; presumably (x, y, score) - TODO confirm
+Model().Operation("HEATMAP_MAX_KEYPOINT", heatmap1, boxes1).To(o1)
+
+# Instantiate an example: per batch entry one 4x4 heatmap (16 values), one box row and one output row
+Example({
+    heatmap1: [
+        -10, -1,  4, -5, # batch0 - maximum 9 in the interior (second row, third column)
+         -8, -2,  9,  1,
+          7, -2,  3, -7,
+         -2,  2, -3,  5,
+        -10, -1,  4, -5, # batch1 - test mirror bottom (maximum 10 lies in the last row)
+         -8, -2,  9,  1,
+          7, -2,  3, -7,
+         -2, 10, -3,  5,
+        -10, -1,  4, -5, # batch2 - test mirror left (maximum 7 lies in the first column)
+         -8, -2,  4,  1,
+          7, -2,  3, -7,
+         -2,  2, -3,  5,
+        -10, -1,  4, 10, # batch3 - test mirror top right (maximum 10 lies in the top-right corner)
+         -8, -2,  4,  1,
+          7, -2,  3, -7,
+         -2,  2, -3,  5,
+        -10,-56,  4, -5, # batch4 - test out of range delta
+         -8, -2,  9,  1,
+          7, -2,  3, -7,
+         -2,  2, -3,  5,
+        -10,-57.827329175, 4, -5, # batch5 - test detA = 0
+         -8, -2,  9,  1,
+          7, -2,  3, -7,
+         -2,  2, -3,  5
+    ],
+    boxes1: [
+        5, 2, 10, 20,
+        1, 7, 30, 10,
+        8, 3, 15, 13,
+        6, 5, 19, 12,
+        5, 2, 10, 20,
+        5, 2, 10, 20
+    ],
+    o1: [
+        8.224462, 8.537316, 9.071493,
+        11.73000, 9.625000, 10.00500,
+        8.875000, 9.562500, 7.187500,
+        17.37500, 5.875000, 10.00000,
+        9.569672, 2.000000, 10.689667,
+        8.125000, 8.750000, 9.000000
+    ]
+}).AddVariations("relaxed")  # presumably what yields the generated *_relaxed model and test
+
+
+# TEST 2: HEATMAP_MAX_KEYPOINT_2 - two boxes, each with a 4x4 heatmap of 4 channels
+heatmap2 = Input("heatmap", "TENSOR_FLOAT32", "{2, 4, 4, 4}")  # NHWC per the commit message; 64 values per box
+boxes2 = Input("boxes", "TENSOR_FLOAT32", "{2, 4}")  # 4 floats per box; presumably [x1, y1, x2, y2] - TODO confirm
+o2 = Output("out", "TENSOR_FLOAT32", "{2, 4, 3}")  # 3 floats per box and channel; presumably (x, y, score) - TODO confirm
+Model().Operation("HEATMAP_MAX_KEYPOINT", heatmap2, boxes2).To(o2)
+
+# Instantiate an example: 64 heatmap values, one box row and four output rows per box
+Example({
+    heatmap2: [
+        0.19, 0.61, 0.49, 0.01, 0.98, 0.65, 0.64, 0.70, 0.76, 0.55,  # first box: 64 values
+        0.83, 0.19, 0.46, 0.03, 0.67, 0.71, 0.17, 0.23, 0.89, 0.08,
+        0.96, 0.65, 0.52, 0.40, 0.36, 0.80, 0.55, 0.89, 0.58, 0.29,
+        0.27, 0.69, 0.66, 0.06, 0.51, 0.26, 0.96, 0.38, 0.41, 0.89,
+        0.88, 0.46, 0.96, 0.73, 0.54, 0.64, 0.84, 0.74, 0.51, 0.41,
+        0.13, 0.19, 0.52, 0.21, 0.50, 0.75, 0.89, 0.89, 0.20, 0.58,
+        0.70, 0.13, 0.29, 0.39,
+        0.91, 0.06, 0.93, 0.34, 0.80, 0.87, 0.59, 0.67, 0.57, 0.85,  # second box: 64 values
+        0.24, 0.25, 0.76, 0.34, 0.37, 0.11, 0.00, 0.29, 0.30, 0.77,
+        0.34, 0.57, 0.48, 0.76, 0.93, 0.18, 0.64, 0.12, 0.67, 0.47,
+        0.56, 0.50, 0.48, 0.99, 0.46, 0.66, 0.98, 0.06, 0.10, 0.66,
+        0.66, 0.91, 0.67, 0.23, 0.40, 0.37, 0.17, 0.35, 0.48, 0.98,
+        0.47, 0.49, 0.56, 0.18, 0.75, 0.29, 0.04, 0.23, 0.42, 0.55,
+        0.38, 0.07, 0.71, 0.80
+    ],
+    boxes2: [
+        5, 2, 10, 20,
+        1, 7, 30, 10
+    ],
+    o2: [
+         7.227723,  4.250000, 1.020210,  # first box: one row per channel (4 rows)
+         8.090278, 17.750000, 0.890556,
+         8.523379, 12.589181, 1.007110,
+         8.365580, 10.122508, 0.945129,
+        12.431603,  8.934225, 0.987798,  # second box: one row per channel (4 rows)
+         4.625000,  9.239437, 1.073820,
+         4.625000,  7.375000, 0.930000,
+        26.375000,  9.625000, 0.800000
+    ]
+}).AddVariations("relaxed")  # presumably what yields the generated *_relaxed_2 model and test